Version bumps after new release
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEw_VdwLJ_GeomW4W4_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water4-Water4
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwLJ_GeomW4W4_VF_avx_256_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrE,jnrF,jnrG,jnrH;
78     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
83     real             rcutoff_scalar;
84     real             *shiftvec,*fshift,*x,*f;
85     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
86     real             scratch[4*DIM];
87     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88     real *           vdwioffsetptr0;
89     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90     real *           vdwioffsetptr1;
91     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92     real *           vdwioffsetptr2;
93     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94     real *           vdwioffsetptr3;
95     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
96     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
97     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
98     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
99     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
100     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
101     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
102     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
103     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
104     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
105     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
106     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
107     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
108     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
109     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
110     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
111     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
112     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
113     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
114     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
115     real             *charge;
116     int              nvdwtype;
117     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
118     int              *vdwtype;
119     real             *vdwparam;
120     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
121     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
122     __m256i          ewitab;
123     __m128i          ewitab_lo,ewitab_hi;
124     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
125     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
126     real             *ewtab;
127     __m256           dummy_mask,cutoff_mask;
128     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
129     __m256           one     = _mm256_set1_ps(1.0);
130     __m256           two     = _mm256_set1_ps(2.0);
131     x                = xx[0];
132     f                = ff[0];
133
134     nri              = nlist->nri;
135     iinr             = nlist->iinr;
136     jindex           = nlist->jindex;
137     jjnr             = nlist->jjnr;
138     shiftidx         = nlist->shift;
139     gid              = nlist->gid;
140     shiftvec         = fr->shift_vec[0];
141     fshift           = fr->fshift[0];
142     facel            = _mm256_set1_ps(fr->epsfac);
143     charge           = mdatoms->chargeA;
144     nvdwtype         = fr->ntype;
145     vdwparam         = fr->nbfp;
146     vdwtype          = mdatoms->typeA;
147
148     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
149     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
150     beta2            = _mm256_mul_ps(beta,beta);
151     beta3            = _mm256_mul_ps(beta,beta2);
152
153     ewtab            = fr->ic->tabq_coul_FDV0;
154     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
155     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
156
157     /* Setup water-specific parameters */
158     inr              = nlist->iinr[0];
159     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
160     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
161     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
162     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
163
164     jq1              = _mm256_set1_ps(charge[inr+1]);
165     jq2              = _mm256_set1_ps(charge[inr+2]);
166     jq3              = _mm256_set1_ps(charge[inr+3]);
167     vdwjidx0A        = 2*vdwtype[inr+0];
168     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
169     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
170     qq11             = _mm256_mul_ps(iq1,jq1);
171     qq12             = _mm256_mul_ps(iq1,jq2);
172     qq13             = _mm256_mul_ps(iq1,jq3);
173     qq21             = _mm256_mul_ps(iq2,jq1);
174     qq22             = _mm256_mul_ps(iq2,jq2);
175     qq23             = _mm256_mul_ps(iq2,jq3);
176     qq31             = _mm256_mul_ps(iq3,jq1);
177     qq32             = _mm256_mul_ps(iq3,jq2);
178     qq33             = _mm256_mul_ps(iq3,jq3);
179
180     /* Avoid stupid compiler warnings */
181     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
182     j_coord_offsetA = 0;
183     j_coord_offsetB = 0;
184     j_coord_offsetC = 0;
185     j_coord_offsetD = 0;
186     j_coord_offsetE = 0;
187     j_coord_offsetF = 0;
188     j_coord_offsetG = 0;
189     j_coord_offsetH = 0;
190
191     outeriter        = 0;
192     inneriter        = 0;
193
194     for(iidx=0;iidx<4*DIM;iidx++)
195     {
196         scratch[iidx] = 0.0;
197     }
198
199     /* Start outer loop over neighborlists */
200     for(iidx=0; iidx<nri; iidx++)
201     {
202         /* Load shift vector for this list */
203         i_shift_offset   = DIM*shiftidx[iidx];
204
205         /* Load limits for loop over neighbors */
206         j_index_start    = jindex[iidx];
207         j_index_end      = jindex[iidx+1];
208
209         /* Get outer coordinate index */
210         inr              = iinr[iidx];
211         i_coord_offset   = DIM*inr;
212
213         /* Load i particle coords and add shift vector */
214         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
215                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
216
217         fix0             = _mm256_setzero_ps();
218         fiy0             = _mm256_setzero_ps();
219         fiz0             = _mm256_setzero_ps();
220         fix1             = _mm256_setzero_ps();
221         fiy1             = _mm256_setzero_ps();
222         fiz1             = _mm256_setzero_ps();
223         fix2             = _mm256_setzero_ps();
224         fiy2             = _mm256_setzero_ps();
225         fiz2             = _mm256_setzero_ps();
226         fix3             = _mm256_setzero_ps();
227         fiy3             = _mm256_setzero_ps();
228         fiz3             = _mm256_setzero_ps();
229
230         /* Reset potential sums */
231         velecsum         = _mm256_setzero_ps();
232         vvdwsum          = _mm256_setzero_ps();
233
234         /* Start inner kernel loop */
235         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
236         {
237
238             /* Get j neighbor index, and coordinate index */
239             jnrA             = jjnr[jidx];
240             jnrB             = jjnr[jidx+1];
241             jnrC             = jjnr[jidx+2];
242             jnrD             = jjnr[jidx+3];
243             jnrE             = jjnr[jidx+4];
244             jnrF             = jjnr[jidx+5];
245             jnrG             = jjnr[jidx+6];
246             jnrH             = jjnr[jidx+7];
247             j_coord_offsetA  = DIM*jnrA;
248             j_coord_offsetB  = DIM*jnrB;
249             j_coord_offsetC  = DIM*jnrC;
250             j_coord_offsetD  = DIM*jnrD;
251             j_coord_offsetE  = DIM*jnrE;
252             j_coord_offsetF  = DIM*jnrF;
253             j_coord_offsetG  = DIM*jnrG;
254             j_coord_offsetH  = DIM*jnrH;
255
256             /* load j atom coordinates */
257             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
258                                                  x+j_coord_offsetC,x+j_coord_offsetD,
259                                                  x+j_coord_offsetE,x+j_coord_offsetF,
260                                                  x+j_coord_offsetG,x+j_coord_offsetH,
261                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
262                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
263
264             /* Calculate displacement vector */
265             dx00             = _mm256_sub_ps(ix0,jx0);
266             dy00             = _mm256_sub_ps(iy0,jy0);
267             dz00             = _mm256_sub_ps(iz0,jz0);
268             dx11             = _mm256_sub_ps(ix1,jx1);
269             dy11             = _mm256_sub_ps(iy1,jy1);
270             dz11             = _mm256_sub_ps(iz1,jz1);
271             dx12             = _mm256_sub_ps(ix1,jx2);
272             dy12             = _mm256_sub_ps(iy1,jy2);
273             dz12             = _mm256_sub_ps(iz1,jz2);
274             dx13             = _mm256_sub_ps(ix1,jx3);
275             dy13             = _mm256_sub_ps(iy1,jy3);
276             dz13             = _mm256_sub_ps(iz1,jz3);
277             dx21             = _mm256_sub_ps(ix2,jx1);
278             dy21             = _mm256_sub_ps(iy2,jy1);
279             dz21             = _mm256_sub_ps(iz2,jz1);
280             dx22             = _mm256_sub_ps(ix2,jx2);
281             dy22             = _mm256_sub_ps(iy2,jy2);
282             dz22             = _mm256_sub_ps(iz2,jz2);
283             dx23             = _mm256_sub_ps(ix2,jx3);
284             dy23             = _mm256_sub_ps(iy2,jy3);
285             dz23             = _mm256_sub_ps(iz2,jz3);
286             dx31             = _mm256_sub_ps(ix3,jx1);
287             dy31             = _mm256_sub_ps(iy3,jy1);
288             dz31             = _mm256_sub_ps(iz3,jz1);
289             dx32             = _mm256_sub_ps(ix3,jx2);
290             dy32             = _mm256_sub_ps(iy3,jy2);
291             dz32             = _mm256_sub_ps(iz3,jz2);
292             dx33             = _mm256_sub_ps(ix3,jx3);
293             dy33             = _mm256_sub_ps(iy3,jy3);
294             dz33             = _mm256_sub_ps(iz3,jz3);
295
296             /* Calculate squared distance and things based on it */
297             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
298             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
299             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
300             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
301             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
302             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
303             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
304             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
305             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
306             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
307
308             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
309             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
310             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
311             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
312             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
313             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
314             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
315             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
316             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
317
318             rinvsq00         = gmx_mm256_inv_ps(rsq00);
319             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
320             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
321             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
322             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
323             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
324             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
325             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
326             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
327             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
328
329             fjx0             = _mm256_setzero_ps();
330             fjy0             = _mm256_setzero_ps();
331             fjz0             = _mm256_setzero_ps();
332             fjx1             = _mm256_setzero_ps();
333             fjy1             = _mm256_setzero_ps();
334             fjz1             = _mm256_setzero_ps();
335             fjx2             = _mm256_setzero_ps();
336             fjy2             = _mm256_setzero_ps();
337             fjz2             = _mm256_setzero_ps();
338             fjx3             = _mm256_setzero_ps();
339             fjy3             = _mm256_setzero_ps();
340             fjz3             = _mm256_setzero_ps();
341
342             /**************************
343              * CALCULATE INTERACTIONS *
344              **************************/
345
346             /* LENNARD-JONES DISPERSION/REPULSION */
347
348             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
349             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
350             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
351             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
352             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
353
354             /* Update potential sum for this i atom from the interaction with this j atom. */
355             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
356
357             fscal            = fvdw;
358
359             /* Calculate temporary vectorial force */
360             tx               = _mm256_mul_ps(fscal,dx00);
361             ty               = _mm256_mul_ps(fscal,dy00);
362             tz               = _mm256_mul_ps(fscal,dz00);
363
364             /* Update vectorial force */
365             fix0             = _mm256_add_ps(fix0,tx);
366             fiy0             = _mm256_add_ps(fiy0,ty);
367             fiz0             = _mm256_add_ps(fiz0,tz);
368
369             fjx0             = _mm256_add_ps(fjx0,tx);
370             fjy0             = _mm256_add_ps(fjy0,ty);
371             fjz0             = _mm256_add_ps(fjz0,tz);
372
373             /**************************
374              * CALCULATE INTERACTIONS *
375              **************************/
376
377             r11              = _mm256_mul_ps(rsq11,rinv11);
378
379             /* EWALD ELECTROSTATICS */
380             
381             /* Analytical PME correction */
382             zeta2            = _mm256_mul_ps(beta2,rsq11);
383             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
384             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
385             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
386             felec            = _mm256_mul_ps(qq11,felec);
387             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
388             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
389             velec            = _mm256_sub_ps(rinv11,pmecorrV);
390             velec            = _mm256_mul_ps(qq11,velec);
391             
392             /* Update potential sum for this i atom from the interaction with this j atom. */
393             velecsum         = _mm256_add_ps(velecsum,velec);
394
395             fscal            = felec;
396
397             /* Calculate temporary vectorial force */
398             tx               = _mm256_mul_ps(fscal,dx11);
399             ty               = _mm256_mul_ps(fscal,dy11);
400             tz               = _mm256_mul_ps(fscal,dz11);
401
402             /* Update vectorial force */
403             fix1             = _mm256_add_ps(fix1,tx);
404             fiy1             = _mm256_add_ps(fiy1,ty);
405             fiz1             = _mm256_add_ps(fiz1,tz);
406
407             fjx1             = _mm256_add_ps(fjx1,tx);
408             fjy1             = _mm256_add_ps(fjy1,ty);
409             fjz1             = _mm256_add_ps(fjz1,tz);
410
411             /**************************
412              * CALCULATE INTERACTIONS *
413              **************************/
414
415             r12              = _mm256_mul_ps(rsq12,rinv12);
416
417             /* EWALD ELECTROSTATICS */
418             
419             /* Analytical PME correction */
420             zeta2            = _mm256_mul_ps(beta2,rsq12);
421             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
422             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
423             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
424             felec            = _mm256_mul_ps(qq12,felec);
425             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
426             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
427             velec            = _mm256_sub_ps(rinv12,pmecorrV);
428             velec            = _mm256_mul_ps(qq12,velec);
429             
430             /* Update potential sum for this i atom from the interaction with this j atom. */
431             velecsum         = _mm256_add_ps(velecsum,velec);
432
433             fscal            = felec;
434
435             /* Calculate temporary vectorial force */
436             tx               = _mm256_mul_ps(fscal,dx12);
437             ty               = _mm256_mul_ps(fscal,dy12);
438             tz               = _mm256_mul_ps(fscal,dz12);
439
440             /* Update vectorial force */
441             fix1             = _mm256_add_ps(fix1,tx);
442             fiy1             = _mm256_add_ps(fiy1,ty);
443             fiz1             = _mm256_add_ps(fiz1,tz);
444
445             fjx2             = _mm256_add_ps(fjx2,tx);
446             fjy2             = _mm256_add_ps(fjy2,ty);
447             fjz2             = _mm256_add_ps(fjz2,tz);
448
449             /**************************
450              * CALCULATE INTERACTIONS *
451              **************************/
452
453             r13              = _mm256_mul_ps(rsq13,rinv13);
454
455             /* EWALD ELECTROSTATICS */
456             
457             /* Analytical PME correction */
458             zeta2            = _mm256_mul_ps(beta2,rsq13);
459             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
460             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
461             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
462             felec            = _mm256_mul_ps(qq13,felec);
463             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
464             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
465             velec            = _mm256_sub_ps(rinv13,pmecorrV);
466             velec            = _mm256_mul_ps(qq13,velec);
467             
468             /* Update potential sum for this i atom from the interaction with this j atom. */
469             velecsum         = _mm256_add_ps(velecsum,velec);
470
471             fscal            = felec;
472
473             /* Calculate temporary vectorial force */
474             tx               = _mm256_mul_ps(fscal,dx13);
475             ty               = _mm256_mul_ps(fscal,dy13);
476             tz               = _mm256_mul_ps(fscal,dz13);
477
478             /* Update vectorial force */
479             fix1             = _mm256_add_ps(fix1,tx);
480             fiy1             = _mm256_add_ps(fiy1,ty);
481             fiz1             = _mm256_add_ps(fiz1,tz);
482
483             fjx3             = _mm256_add_ps(fjx3,tx);
484             fjy3             = _mm256_add_ps(fjy3,ty);
485             fjz3             = _mm256_add_ps(fjz3,tz);
486
487             /**************************
488              * CALCULATE INTERACTIONS *
489              **************************/
490
491             r21              = _mm256_mul_ps(rsq21,rinv21);
492
493             /* EWALD ELECTROSTATICS */
494             
495             /* Analytical PME correction */
496             zeta2            = _mm256_mul_ps(beta2,rsq21);
497             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
498             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
499             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
500             felec            = _mm256_mul_ps(qq21,felec);
501             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
502             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
503             velec            = _mm256_sub_ps(rinv21,pmecorrV);
504             velec            = _mm256_mul_ps(qq21,velec);
505             
506             /* Update potential sum for this i atom from the interaction with this j atom. */
507             velecsum         = _mm256_add_ps(velecsum,velec);
508
509             fscal            = felec;
510
511             /* Calculate temporary vectorial force */
512             tx               = _mm256_mul_ps(fscal,dx21);
513             ty               = _mm256_mul_ps(fscal,dy21);
514             tz               = _mm256_mul_ps(fscal,dz21);
515
516             /* Update vectorial force */
517             fix2             = _mm256_add_ps(fix2,tx);
518             fiy2             = _mm256_add_ps(fiy2,ty);
519             fiz2             = _mm256_add_ps(fiz2,tz);
520
521             fjx1             = _mm256_add_ps(fjx1,tx);
522             fjy1             = _mm256_add_ps(fjy1,ty);
523             fjz1             = _mm256_add_ps(fjz1,tz);
524
525             /**************************
526              * CALCULATE INTERACTIONS *
527              **************************/
528
529             r22              = _mm256_mul_ps(rsq22,rinv22);
530
531             /* EWALD ELECTROSTATICS */
532             
533             /* Analytical PME correction */
534             zeta2            = _mm256_mul_ps(beta2,rsq22);
535             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
536             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
537             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
538             felec            = _mm256_mul_ps(qq22,felec);
539             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
540             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
541             velec            = _mm256_sub_ps(rinv22,pmecorrV);
542             velec            = _mm256_mul_ps(qq22,velec);
543             
544             /* Update potential sum for this i atom from the interaction with this j atom. */
545             velecsum         = _mm256_add_ps(velecsum,velec);
546
547             fscal            = felec;
548
549             /* Calculate temporary vectorial force */
550             tx               = _mm256_mul_ps(fscal,dx22);
551             ty               = _mm256_mul_ps(fscal,dy22);
552             tz               = _mm256_mul_ps(fscal,dz22);
553
554             /* Update vectorial force */
555             fix2             = _mm256_add_ps(fix2,tx);
556             fiy2             = _mm256_add_ps(fiy2,ty);
557             fiz2             = _mm256_add_ps(fiz2,tz);
558
559             fjx2             = _mm256_add_ps(fjx2,tx);
560             fjy2             = _mm256_add_ps(fjy2,ty);
561             fjz2             = _mm256_add_ps(fjz2,tz);
562
563             /**************************
564              * CALCULATE INTERACTIONS *
565              **************************/
566
567             r23              = _mm256_mul_ps(rsq23,rinv23);
568
569             /* EWALD ELECTROSTATICS */
570             
571             /* Analytical PME correction */
572             zeta2            = _mm256_mul_ps(beta2,rsq23);
573             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
574             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
575             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
576             felec            = _mm256_mul_ps(qq23,felec);
577             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
578             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
579             velec            = _mm256_sub_ps(rinv23,pmecorrV);
580             velec            = _mm256_mul_ps(qq23,velec);
581             
582             /* Update potential sum for this i atom from the interaction with this j atom. */
583             velecsum         = _mm256_add_ps(velecsum,velec);
584
585             fscal            = felec;
586
587             /* Calculate temporary vectorial force */
588             tx               = _mm256_mul_ps(fscal,dx23);
589             ty               = _mm256_mul_ps(fscal,dy23);
590             tz               = _mm256_mul_ps(fscal,dz23);
591
592             /* Update vectorial force */
593             fix2             = _mm256_add_ps(fix2,tx);
594             fiy2             = _mm256_add_ps(fiy2,ty);
595             fiz2             = _mm256_add_ps(fiz2,tz);
596
597             fjx3             = _mm256_add_ps(fjx3,tx);
598             fjy3             = _mm256_add_ps(fjy3,ty);
599             fjz3             = _mm256_add_ps(fjz3,tz);
600
601             /**************************
602              * CALCULATE INTERACTIONS *
603              **************************/
604
605             r31              = _mm256_mul_ps(rsq31,rinv31);
606
607             /* EWALD ELECTROSTATICS */
608             
609             /* Analytical PME correction */
610             zeta2            = _mm256_mul_ps(beta2,rsq31);
611             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
612             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
613             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
614             felec            = _mm256_mul_ps(qq31,felec);
615             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
616             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
617             velec            = _mm256_sub_ps(rinv31,pmecorrV);
618             velec            = _mm256_mul_ps(qq31,velec);
619             
620             /* Update potential sum for this i atom from the interaction with this j atom. */
621             velecsum         = _mm256_add_ps(velecsum,velec);
622
623             fscal            = felec;
624
625             /* Calculate temporary vectorial force */
626             tx               = _mm256_mul_ps(fscal,dx31);
627             ty               = _mm256_mul_ps(fscal,dy31);
628             tz               = _mm256_mul_ps(fscal,dz31);
629
630             /* Update vectorial force */
631             fix3             = _mm256_add_ps(fix3,tx);
632             fiy3             = _mm256_add_ps(fiy3,ty);
633             fiz3             = _mm256_add_ps(fiz3,tz);
634
635             fjx1             = _mm256_add_ps(fjx1,tx);
636             fjy1             = _mm256_add_ps(fjy1,ty);
637             fjz1             = _mm256_add_ps(fjz1,tz);
638
639             /**************************
640              * CALCULATE INTERACTIONS *
641              **************************/
642
643             r32              = _mm256_mul_ps(rsq32,rinv32);
644
645             /* EWALD ELECTROSTATICS */
646             
647             /* Analytical PME correction */
648             zeta2            = _mm256_mul_ps(beta2,rsq32);
649             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
650             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
651             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
652             felec            = _mm256_mul_ps(qq32,felec);
653             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
654             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
655             velec            = _mm256_sub_ps(rinv32,pmecorrV);
656             velec            = _mm256_mul_ps(qq32,velec);
657             
658             /* Update potential sum for this i atom from the interaction with this j atom. */
659             velecsum         = _mm256_add_ps(velecsum,velec);
660
661             fscal            = felec;
662
663             /* Calculate temporary vectorial force */
664             tx               = _mm256_mul_ps(fscal,dx32);
665             ty               = _mm256_mul_ps(fscal,dy32);
666             tz               = _mm256_mul_ps(fscal,dz32);
667
668             /* Update vectorial force */
669             fix3             = _mm256_add_ps(fix3,tx);
670             fiy3             = _mm256_add_ps(fiy3,ty);
671             fiz3             = _mm256_add_ps(fiz3,tz);
672
673             fjx2             = _mm256_add_ps(fjx2,tx);
674             fjy2             = _mm256_add_ps(fjy2,ty);
675             fjz2             = _mm256_add_ps(fjz2,tz);
676
677             /**************************
678              * CALCULATE INTERACTIONS *
679              **************************/
680
681             r33              = _mm256_mul_ps(rsq33,rinv33);
682
683             /* EWALD ELECTROSTATICS */
684             
685             /* Analytical PME correction */
686             zeta2            = _mm256_mul_ps(beta2,rsq33);
687             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
688             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
689             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
690             felec            = _mm256_mul_ps(qq33,felec);
691             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
692             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
693             velec            = _mm256_sub_ps(rinv33,pmecorrV);
694             velec            = _mm256_mul_ps(qq33,velec);
695             
696             /* Update potential sum for this i atom from the interaction with this j atom. */
697             velecsum         = _mm256_add_ps(velecsum,velec);
698
699             fscal            = felec;
700
701             /* Calculate temporary vectorial force */
702             tx               = _mm256_mul_ps(fscal,dx33);
703             ty               = _mm256_mul_ps(fscal,dy33);
704             tz               = _mm256_mul_ps(fscal,dz33);
705
706             /* Update vectorial force */
707             fix3             = _mm256_add_ps(fix3,tx);
708             fiy3             = _mm256_add_ps(fiy3,ty);
709             fiz3             = _mm256_add_ps(fiz3,tz);
710
711             fjx3             = _mm256_add_ps(fjx3,tx);
712             fjy3             = _mm256_add_ps(fjy3,ty);
713             fjz3             = _mm256_add_ps(fjz3,tz);
714
715             fjptrA             = f+j_coord_offsetA;
716             fjptrB             = f+j_coord_offsetB;
717             fjptrC             = f+j_coord_offsetC;
718             fjptrD             = f+j_coord_offsetD;
719             fjptrE             = f+j_coord_offsetE;
720             fjptrF             = f+j_coord_offsetF;
721             fjptrG             = f+j_coord_offsetG;
722             fjptrH             = f+j_coord_offsetH;
723
724             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
725                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
726                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
727
728             /* Inner loop uses 791 flops */
729         }
730
731         if(jidx<j_index_end)
732         {
733
734             /* Get j neighbor index, and coordinate index */
735             jnrlistA         = jjnr[jidx];
736             jnrlistB         = jjnr[jidx+1];
737             jnrlistC         = jjnr[jidx+2];
738             jnrlistD         = jjnr[jidx+3];
739             jnrlistE         = jjnr[jidx+4];
740             jnrlistF         = jjnr[jidx+5];
741             jnrlistG         = jjnr[jidx+6];
742             jnrlistH         = jjnr[jidx+7];
743             /* Sign of each element will be negative for non-real atoms.
744              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
745              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
746              */
747             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
748                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
749                                             
750             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
751             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
752             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
753             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
754             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
755             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
756             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
757             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
758             j_coord_offsetA  = DIM*jnrA;
759             j_coord_offsetB  = DIM*jnrB;
760             j_coord_offsetC  = DIM*jnrC;
761             j_coord_offsetD  = DIM*jnrD;
762             j_coord_offsetE  = DIM*jnrE;
763             j_coord_offsetF  = DIM*jnrF;
764             j_coord_offsetG  = DIM*jnrG;
765             j_coord_offsetH  = DIM*jnrH;
766
767             /* load j atom coordinates */
768             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
769                                                  x+j_coord_offsetC,x+j_coord_offsetD,
770                                                  x+j_coord_offsetE,x+j_coord_offsetF,
771                                                  x+j_coord_offsetG,x+j_coord_offsetH,
772                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
773                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
774
775             /* Calculate displacement vector */
776             dx00             = _mm256_sub_ps(ix0,jx0);
777             dy00             = _mm256_sub_ps(iy0,jy0);
778             dz00             = _mm256_sub_ps(iz0,jz0);
779             dx11             = _mm256_sub_ps(ix1,jx1);
780             dy11             = _mm256_sub_ps(iy1,jy1);
781             dz11             = _mm256_sub_ps(iz1,jz1);
782             dx12             = _mm256_sub_ps(ix1,jx2);
783             dy12             = _mm256_sub_ps(iy1,jy2);
784             dz12             = _mm256_sub_ps(iz1,jz2);
785             dx13             = _mm256_sub_ps(ix1,jx3);
786             dy13             = _mm256_sub_ps(iy1,jy3);
787             dz13             = _mm256_sub_ps(iz1,jz3);
788             dx21             = _mm256_sub_ps(ix2,jx1);
789             dy21             = _mm256_sub_ps(iy2,jy1);
790             dz21             = _mm256_sub_ps(iz2,jz1);
791             dx22             = _mm256_sub_ps(ix2,jx2);
792             dy22             = _mm256_sub_ps(iy2,jy2);
793             dz22             = _mm256_sub_ps(iz2,jz2);
794             dx23             = _mm256_sub_ps(ix2,jx3);
795             dy23             = _mm256_sub_ps(iy2,jy3);
796             dz23             = _mm256_sub_ps(iz2,jz3);
797             dx31             = _mm256_sub_ps(ix3,jx1);
798             dy31             = _mm256_sub_ps(iy3,jy1);
799             dz31             = _mm256_sub_ps(iz3,jz1);
800             dx32             = _mm256_sub_ps(ix3,jx2);
801             dy32             = _mm256_sub_ps(iy3,jy2);
802             dz32             = _mm256_sub_ps(iz3,jz2);
803             dx33             = _mm256_sub_ps(ix3,jx3);
804             dy33             = _mm256_sub_ps(iy3,jy3);
805             dz33             = _mm256_sub_ps(iz3,jz3);
806
807             /* Calculate squared distance and things based on it */
808             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
809             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
810             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
811             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
812             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
813             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
814             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
815             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
816             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
817             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
818
819             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
820             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
821             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
822             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
823             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
824             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
825             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
826             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
827             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
828
829             rinvsq00         = gmx_mm256_inv_ps(rsq00);
830             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
831             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
832             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
833             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
834             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
835             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
836             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
837             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
838             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
839
840             fjx0             = _mm256_setzero_ps();
841             fjy0             = _mm256_setzero_ps();
842             fjz0             = _mm256_setzero_ps();
843             fjx1             = _mm256_setzero_ps();
844             fjy1             = _mm256_setzero_ps();
845             fjz1             = _mm256_setzero_ps();
846             fjx2             = _mm256_setzero_ps();
847             fjy2             = _mm256_setzero_ps();
848             fjz2             = _mm256_setzero_ps();
849             fjx3             = _mm256_setzero_ps();
850             fjy3             = _mm256_setzero_ps();
851             fjz3             = _mm256_setzero_ps();
852
853             /**************************
854              * CALCULATE INTERACTIONS *
855              **************************/
856
857             /* LENNARD-JONES DISPERSION/REPULSION */
858
859             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
860             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
861             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
862             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
863             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
864
865             /* Update potential sum for this i atom from the interaction with this j atom. */
866             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
867             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
868
869             fscal            = fvdw;
870
871             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
872
873             /* Calculate temporary vectorial force */
874             tx               = _mm256_mul_ps(fscal,dx00);
875             ty               = _mm256_mul_ps(fscal,dy00);
876             tz               = _mm256_mul_ps(fscal,dz00);
877
878             /* Update vectorial force */
879             fix0             = _mm256_add_ps(fix0,tx);
880             fiy0             = _mm256_add_ps(fiy0,ty);
881             fiz0             = _mm256_add_ps(fiz0,tz);
882
883             fjx0             = _mm256_add_ps(fjx0,tx);
884             fjy0             = _mm256_add_ps(fjy0,ty);
885             fjz0             = _mm256_add_ps(fjz0,tz);
886
887             /**************************
888              * CALCULATE INTERACTIONS *
889              **************************/
890
891             r11              = _mm256_mul_ps(rsq11,rinv11);
892             r11              = _mm256_andnot_ps(dummy_mask,r11);
893
894             /* EWALD ELECTROSTATICS */
895             
896             /* Analytical PME correction */
897             zeta2            = _mm256_mul_ps(beta2,rsq11);
898             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
899             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
900             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
901             felec            = _mm256_mul_ps(qq11,felec);
902             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
903             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
904             velec            = _mm256_sub_ps(rinv11,pmecorrV);
905             velec            = _mm256_mul_ps(qq11,velec);
906             
907             /* Update potential sum for this i atom from the interaction with this j atom. */
908             velec            = _mm256_andnot_ps(dummy_mask,velec);
909             velecsum         = _mm256_add_ps(velecsum,velec);
910
911             fscal            = felec;
912
913             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
914
915             /* Calculate temporary vectorial force */
916             tx               = _mm256_mul_ps(fscal,dx11);
917             ty               = _mm256_mul_ps(fscal,dy11);
918             tz               = _mm256_mul_ps(fscal,dz11);
919
920             /* Update vectorial force */
921             fix1             = _mm256_add_ps(fix1,tx);
922             fiy1             = _mm256_add_ps(fiy1,ty);
923             fiz1             = _mm256_add_ps(fiz1,tz);
924
925             fjx1             = _mm256_add_ps(fjx1,tx);
926             fjy1             = _mm256_add_ps(fjy1,ty);
927             fjz1             = _mm256_add_ps(fjz1,tz);
928
929             /**************************
930              * CALCULATE INTERACTIONS *
931              **************************/
932
933             r12              = _mm256_mul_ps(rsq12,rinv12);
934             r12              = _mm256_andnot_ps(dummy_mask,r12);
935
936             /* EWALD ELECTROSTATICS */
937             
938             /* Analytical PME correction */
939             zeta2            = _mm256_mul_ps(beta2,rsq12);
940             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
941             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
942             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
943             felec            = _mm256_mul_ps(qq12,felec);
944             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
945             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
946             velec            = _mm256_sub_ps(rinv12,pmecorrV);
947             velec            = _mm256_mul_ps(qq12,velec);
948             
949             /* Update potential sum for this i atom from the interaction with this j atom. */
950             velec            = _mm256_andnot_ps(dummy_mask,velec);
951             velecsum         = _mm256_add_ps(velecsum,velec);
952
953             fscal            = felec;
954
955             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
956
957             /* Calculate temporary vectorial force */
958             tx               = _mm256_mul_ps(fscal,dx12);
959             ty               = _mm256_mul_ps(fscal,dy12);
960             tz               = _mm256_mul_ps(fscal,dz12);
961
962             /* Update vectorial force */
963             fix1             = _mm256_add_ps(fix1,tx);
964             fiy1             = _mm256_add_ps(fiy1,ty);
965             fiz1             = _mm256_add_ps(fiz1,tz);
966
967             fjx2             = _mm256_add_ps(fjx2,tx);
968             fjy2             = _mm256_add_ps(fjy2,ty);
969             fjz2             = _mm256_add_ps(fjz2,tz);
970
971             /**************************
972              * CALCULATE INTERACTIONS *
973              **************************/
974
975             r13              = _mm256_mul_ps(rsq13,rinv13);
976             r13              = _mm256_andnot_ps(dummy_mask,r13);
977
978             /* EWALD ELECTROSTATICS */
979             
980             /* Analytical PME correction */
981             zeta2            = _mm256_mul_ps(beta2,rsq13);
982             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
983             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
984             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
985             felec            = _mm256_mul_ps(qq13,felec);
986             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
987             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
988             velec            = _mm256_sub_ps(rinv13,pmecorrV);
989             velec            = _mm256_mul_ps(qq13,velec);
990             
991             /* Update potential sum for this i atom from the interaction with this j atom. */
992             velec            = _mm256_andnot_ps(dummy_mask,velec);
993             velecsum         = _mm256_add_ps(velecsum,velec);
994
995             fscal            = felec;
996
997             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
998
999             /* Calculate temporary vectorial force */
1000             tx               = _mm256_mul_ps(fscal,dx13);
1001             ty               = _mm256_mul_ps(fscal,dy13);
1002             tz               = _mm256_mul_ps(fscal,dz13);
1003
1004             /* Update vectorial force */
1005             fix1             = _mm256_add_ps(fix1,tx);
1006             fiy1             = _mm256_add_ps(fiy1,ty);
1007             fiz1             = _mm256_add_ps(fiz1,tz);
1008
1009             fjx3             = _mm256_add_ps(fjx3,tx);
1010             fjy3             = _mm256_add_ps(fjy3,ty);
1011             fjz3             = _mm256_add_ps(fjz3,tz);
1012
1013             /**************************
1014              * CALCULATE INTERACTIONS *
1015              **************************/
1016
1017             r21              = _mm256_mul_ps(rsq21,rinv21);
1018             r21              = _mm256_andnot_ps(dummy_mask,r21);
1019
1020             /* EWALD ELECTROSTATICS */
1021             
1022             /* Analytical PME correction */
1023             zeta2            = _mm256_mul_ps(beta2,rsq21);
1024             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1025             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1026             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1027             felec            = _mm256_mul_ps(qq21,felec);
1028             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1029             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1030             velec            = _mm256_sub_ps(rinv21,pmecorrV);
1031             velec            = _mm256_mul_ps(qq21,velec);
1032             
1033             /* Update potential sum for this i atom from the interaction with this j atom. */
1034             velec            = _mm256_andnot_ps(dummy_mask,velec);
1035             velecsum         = _mm256_add_ps(velecsum,velec);
1036
1037             fscal            = felec;
1038
1039             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1040
1041             /* Calculate temporary vectorial force */
1042             tx               = _mm256_mul_ps(fscal,dx21);
1043             ty               = _mm256_mul_ps(fscal,dy21);
1044             tz               = _mm256_mul_ps(fscal,dz21);
1045
1046             /* Update vectorial force */
1047             fix2             = _mm256_add_ps(fix2,tx);
1048             fiy2             = _mm256_add_ps(fiy2,ty);
1049             fiz2             = _mm256_add_ps(fiz2,tz);
1050
1051             fjx1             = _mm256_add_ps(fjx1,tx);
1052             fjy1             = _mm256_add_ps(fjy1,ty);
1053             fjz1             = _mm256_add_ps(fjz1,tz);
1054
1055             /**************************
1056              * CALCULATE INTERACTIONS *
1057              **************************/
1058
1059             r22              = _mm256_mul_ps(rsq22,rinv22);
1060             r22              = _mm256_andnot_ps(dummy_mask,r22);
1061
1062             /* EWALD ELECTROSTATICS */
1063             
1064             /* Analytical PME correction */
1065             zeta2            = _mm256_mul_ps(beta2,rsq22);
1066             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1067             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1068             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1069             felec            = _mm256_mul_ps(qq22,felec);
1070             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1071             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1072             velec            = _mm256_sub_ps(rinv22,pmecorrV);
1073             velec            = _mm256_mul_ps(qq22,velec);
1074             
1075             /* Update potential sum for this i atom from the interaction with this j atom. */
1076             velec            = _mm256_andnot_ps(dummy_mask,velec);
1077             velecsum         = _mm256_add_ps(velecsum,velec);
1078
1079             fscal            = felec;
1080
1081             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1082
1083             /* Calculate temporary vectorial force */
1084             tx               = _mm256_mul_ps(fscal,dx22);
1085             ty               = _mm256_mul_ps(fscal,dy22);
1086             tz               = _mm256_mul_ps(fscal,dz22);
1087
1088             /* Update vectorial force */
1089             fix2             = _mm256_add_ps(fix2,tx);
1090             fiy2             = _mm256_add_ps(fiy2,ty);
1091             fiz2             = _mm256_add_ps(fiz2,tz);
1092
1093             fjx2             = _mm256_add_ps(fjx2,tx);
1094             fjy2             = _mm256_add_ps(fjy2,ty);
1095             fjz2             = _mm256_add_ps(fjz2,tz);
1096
1097             /**************************
1098              * CALCULATE INTERACTIONS *
1099              **************************/
1100
1101             r23              = _mm256_mul_ps(rsq23,rinv23);
1102             r23              = _mm256_andnot_ps(dummy_mask,r23);
1103
1104             /* EWALD ELECTROSTATICS */
1105             
1106             /* Analytical PME correction */
1107             zeta2            = _mm256_mul_ps(beta2,rsq23);
1108             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
1109             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1110             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1111             felec            = _mm256_mul_ps(qq23,felec);
1112             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1113             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1114             velec            = _mm256_sub_ps(rinv23,pmecorrV);
1115             velec            = _mm256_mul_ps(qq23,velec);
1116             
1117             /* Update potential sum for this i atom from the interaction with this j atom. */
1118             velec            = _mm256_andnot_ps(dummy_mask,velec);
1119             velecsum         = _mm256_add_ps(velecsum,velec);
1120
1121             fscal            = felec;
1122
1123             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1124
1125             /* Calculate temporary vectorial force */
1126             tx               = _mm256_mul_ps(fscal,dx23);
1127             ty               = _mm256_mul_ps(fscal,dy23);
1128             tz               = _mm256_mul_ps(fscal,dz23);
1129
1130             /* Update vectorial force */
1131             fix2             = _mm256_add_ps(fix2,tx);
1132             fiy2             = _mm256_add_ps(fiy2,ty);
1133             fiz2             = _mm256_add_ps(fiz2,tz);
1134
1135             fjx3             = _mm256_add_ps(fjx3,tx);
1136             fjy3             = _mm256_add_ps(fjy3,ty);
1137             fjz3             = _mm256_add_ps(fjz3,tz);
1138
1139             /**************************
1140              * CALCULATE INTERACTIONS *
1141              **************************/
1142
1143             r31              = _mm256_mul_ps(rsq31,rinv31);
1144             r31              = _mm256_andnot_ps(dummy_mask,r31);
1145
1146             /* EWALD ELECTROSTATICS */
1147             
1148             /* Analytical PME correction */
1149             zeta2            = _mm256_mul_ps(beta2,rsq31);
1150             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
1151             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1152             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1153             felec            = _mm256_mul_ps(qq31,felec);
1154             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1155             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1156             velec            = _mm256_sub_ps(rinv31,pmecorrV);
1157             velec            = _mm256_mul_ps(qq31,velec);
1158             
1159             /* Update potential sum for this i atom from the interaction with this j atom. */
1160             velec            = _mm256_andnot_ps(dummy_mask,velec);
1161             velecsum         = _mm256_add_ps(velecsum,velec);
1162
1163             fscal            = felec;
1164
1165             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1166
1167             /* Calculate temporary vectorial force */
1168             tx               = _mm256_mul_ps(fscal,dx31);
1169             ty               = _mm256_mul_ps(fscal,dy31);
1170             tz               = _mm256_mul_ps(fscal,dz31);
1171
1172             /* Update vectorial force */
1173             fix3             = _mm256_add_ps(fix3,tx);
1174             fiy3             = _mm256_add_ps(fiy3,ty);
1175             fiz3             = _mm256_add_ps(fiz3,tz);
1176
1177             fjx1             = _mm256_add_ps(fjx1,tx);
1178             fjy1             = _mm256_add_ps(fjy1,ty);
1179             fjz1             = _mm256_add_ps(fjz1,tz);
1180
1181             /**************************
1182              * CALCULATE INTERACTIONS *
1183              **************************/
1184
1185             r32              = _mm256_mul_ps(rsq32,rinv32);
1186             r32              = _mm256_andnot_ps(dummy_mask,r32);
1187
1188             /* EWALD ELECTROSTATICS */
1189             
1190             /* Analytical PME correction */
1191             zeta2            = _mm256_mul_ps(beta2,rsq32);
1192             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
1193             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1194             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1195             felec            = _mm256_mul_ps(qq32,felec);
1196             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1197             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1198             velec            = _mm256_sub_ps(rinv32,pmecorrV);
1199             velec            = _mm256_mul_ps(qq32,velec);
1200             
1201             /* Update potential sum for this i atom from the interaction with this j atom. */
1202             velec            = _mm256_andnot_ps(dummy_mask,velec);
1203             velecsum         = _mm256_add_ps(velecsum,velec);
1204
1205             fscal            = felec;
1206
1207             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1208
1209             /* Calculate temporary vectorial force */
1210             tx               = _mm256_mul_ps(fscal,dx32);
1211             ty               = _mm256_mul_ps(fscal,dy32);
1212             tz               = _mm256_mul_ps(fscal,dz32);
1213
1214             /* Update vectorial force */
1215             fix3             = _mm256_add_ps(fix3,tx);
1216             fiy3             = _mm256_add_ps(fiy3,ty);
1217             fiz3             = _mm256_add_ps(fiz3,tz);
1218
1219             fjx2             = _mm256_add_ps(fjx2,tx);
1220             fjy2             = _mm256_add_ps(fjy2,ty);
1221             fjz2             = _mm256_add_ps(fjz2,tz);
1222
1223             /**************************
1224              * CALCULATE INTERACTIONS *
1225              **************************/
1226
1227             r33              = _mm256_mul_ps(rsq33,rinv33);
1228             r33              = _mm256_andnot_ps(dummy_mask,r33);
1229
1230             /* EWALD ELECTROSTATICS */
1231             
1232             /* Analytical PME correction */
1233             zeta2            = _mm256_mul_ps(beta2,rsq33);
1234             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
1235             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1236             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1237             felec            = _mm256_mul_ps(qq33,felec);
1238             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1239             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1240             velec            = _mm256_sub_ps(rinv33,pmecorrV);
1241             velec            = _mm256_mul_ps(qq33,velec);
1242             
1243             /* Update potential sum for this i atom from the interaction with this j atom. */
1244             velec            = _mm256_andnot_ps(dummy_mask,velec);
1245             velecsum         = _mm256_add_ps(velecsum,velec);
1246
1247             fscal            = felec;
1248
1249             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1250
1251             /* Calculate temporary vectorial force */
1252             tx               = _mm256_mul_ps(fscal,dx33);
1253             ty               = _mm256_mul_ps(fscal,dy33);
1254             tz               = _mm256_mul_ps(fscal,dz33);
1255
1256             /* Update vectorial force */
1257             fix3             = _mm256_add_ps(fix3,tx);
1258             fiy3             = _mm256_add_ps(fiy3,ty);
1259             fiz3             = _mm256_add_ps(fiz3,tz);
1260
1261             fjx3             = _mm256_add_ps(fjx3,tx);
1262             fjy3             = _mm256_add_ps(fjy3,ty);
1263             fjz3             = _mm256_add_ps(fjz3,tz);
1264
1265             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1266             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1267             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1268             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1269             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1270             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1271             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1272             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1273
1274             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1275                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1276                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1277
1278             /* Inner loop uses 800 flops */
1279         }
1280
1281         /* End of innermost loop */
1282
1283         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1284                                                  f+i_coord_offset,fshift+i_shift_offset);
1285
1286         ggid                        = gid[iidx];
1287         /* Update potential energies */
1288         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1289         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1290
1291         /* Increment number of inner iterations */
1292         inneriter                  += j_index_end - j_index_start;
1293
1294         /* Outer loop uses 26 flops */
1295     }
1296
1297     /* Increment number of outer iterations */
1298     outeriter        += nri;
1299
1300     /* Update outer/inner flops */
1301
1302     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*800);
1303 }
1304 /*
1305  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_single
1306  * Electrostatics interaction: Ewald
1307  * VdW interaction:            LennardJones
1308  * Geometry:                   Water4-Water4
1309  * Calculate force/pot:        Force
1310  */
1311 void
1312 nb_kernel_ElecEw_VdwLJ_GeomW4W4_F_avx_256_single
1313                     (t_nblist                    * gmx_restrict       nlist,
1314                      rvec                        * gmx_restrict          xx,
1315                      rvec                        * gmx_restrict          ff,
1316                      t_forcerec                  * gmx_restrict          fr,
1317                      t_mdatoms                   * gmx_restrict     mdatoms,
1318                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1319                      t_nrnb                      * gmx_restrict        nrnb)
1320 {
1321     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1322      * just 0 for non-waters.
1323      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1324      * jnr indices corresponding to data put in the four positions in the SIMD register.
1325      */
1326     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1327     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1328     int              jnrA,jnrB,jnrC,jnrD;
1329     int              jnrE,jnrF,jnrG,jnrH;
1330     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1331     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1332     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1333     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1334     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1335     real             rcutoff_scalar;
1336     real             *shiftvec,*fshift,*x,*f;
1337     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1338     real             scratch[4*DIM];
1339     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1340     real *           vdwioffsetptr0;
1341     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1342     real *           vdwioffsetptr1;
1343     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1344     real *           vdwioffsetptr2;
1345     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1346     real *           vdwioffsetptr3;
1347     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1348     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1349     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1350     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1351     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1352     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1353     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1354     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1355     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1356     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1357     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1358     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1359     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1360     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1361     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1362     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1363     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1364     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1365     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1366     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1367     real             *charge;
1368     int              nvdwtype;
1369     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1370     int              *vdwtype;
1371     real             *vdwparam;
1372     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1373     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1374     __m256i          ewitab;
1375     __m128i          ewitab_lo,ewitab_hi;
1376     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1377     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1378     real             *ewtab;
1379     __m256           dummy_mask,cutoff_mask;
1380     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1381     __m256           one     = _mm256_set1_ps(1.0);
1382     __m256           two     = _mm256_set1_ps(2.0);
1383     x                = xx[0];
1384     f                = ff[0];
1385
1386     nri              = nlist->nri;
1387     iinr             = nlist->iinr;
1388     jindex           = nlist->jindex;
1389     jjnr             = nlist->jjnr;
1390     shiftidx         = nlist->shift;
1391     gid              = nlist->gid;
1392     shiftvec         = fr->shift_vec[0];
1393     fshift           = fr->fshift[0];
1394     facel            = _mm256_set1_ps(fr->epsfac);
1395     charge           = mdatoms->chargeA;
1396     nvdwtype         = fr->ntype;
1397     vdwparam         = fr->nbfp;
1398     vdwtype          = mdatoms->typeA;
1399
1400     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1401     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1402     beta2            = _mm256_mul_ps(beta,beta);
1403     beta3            = _mm256_mul_ps(beta,beta2);
1404
1405     ewtab            = fr->ic->tabq_coul_F;
1406     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1407     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1408
1409     /* Setup water-specific parameters */
1410     inr              = nlist->iinr[0];
1411     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1412     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1413     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1414     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1415
1416     jq1              = _mm256_set1_ps(charge[inr+1]);
1417     jq2              = _mm256_set1_ps(charge[inr+2]);
1418     jq3              = _mm256_set1_ps(charge[inr+3]);
1419     vdwjidx0A        = 2*vdwtype[inr+0];
1420     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1421     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1422     qq11             = _mm256_mul_ps(iq1,jq1);
1423     qq12             = _mm256_mul_ps(iq1,jq2);
1424     qq13             = _mm256_mul_ps(iq1,jq3);
1425     qq21             = _mm256_mul_ps(iq2,jq1);
1426     qq22             = _mm256_mul_ps(iq2,jq2);
1427     qq23             = _mm256_mul_ps(iq2,jq3);
1428     qq31             = _mm256_mul_ps(iq3,jq1);
1429     qq32             = _mm256_mul_ps(iq3,jq2);
1430     qq33             = _mm256_mul_ps(iq3,jq3);
1431
1432     /* Avoid stupid compiler warnings */
1433     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1434     j_coord_offsetA = 0;
1435     j_coord_offsetB = 0;
1436     j_coord_offsetC = 0;
1437     j_coord_offsetD = 0;
1438     j_coord_offsetE = 0;
1439     j_coord_offsetF = 0;
1440     j_coord_offsetG = 0;
1441     j_coord_offsetH = 0;
1442
1443     outeriter        = 0;
1444     inneriter        = 0;
1445
1446     for(iidx=0;iidx<4*DIM;iidx++)
1447     {
1448         scratch[iidx] = 0.0;
1449     }
1450
1451     /* Start outer loop over neighborlists */
1452     for(iidx=0; iidx<nri; iidx++)
1453     {
1454         /* Load shift vector for this list */
1455         i_shift_offset   = DIM*shiftidx[iidx];
1456
1457         /* Load limits for loop over neighbors */
1458         j_index_start    = jindex[iidx];
1459         j_index_end      = jindex[iidx+1];
1460
1461         /* Get outer coordinate index */
1462         inr              = iinr[iidx];
1463         i_coord_offset   = DIM*inr;
1464
1465         /* Load i particle coords and add shift vector */
1466         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1467                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1468
1469         fix0             = _mm256_setzero_ps();
1470         fiy0             = _mm256_setzero_ps();
1471         fiz0             = _mm256_setzero_ps();
1472         fix1             = _mm256_setzero_ps();
1473         fiy1             = _mm256_setzero_ps();
1474         fiz1             = _mm256_setzero_ps();
1475         fix2             = _mm256_setzero_ps();
1476         fiy2             = _mm256_setzero_ps();
1477         fiz2             = _mm256_setzero_ps();
1478         fix3             = _mm256_setzero_ps();
1479         fiy3             = _mm256_setzero_ps();
1480         fiz3             = _mm256_setzero_ps();
1481
1482         /* Start inner kernel loop */
1483         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1484         {
1485
1486             /* Get j neighbor index, and coordinate index */
1487             jnrA             = jjnr[jidx];
1488             jnrB             = jjnr[jidx+1];
1489             jnrC             = jjnr[jidx+2];
1490             jnrD             = jjnr[jidx+3];
1491             jnrE             = jjnr[jidx+4];
1492             jnrF             = jjnr[jidx+5];
1493             jnrG             = jjnr[jidx+6];
1494             jnrH             = jjnr[jidx+7];
1495             j_coord_offsetA  = DIM*jnrA;
1496             j_coord_offsetB  = DIM*jnrB;
1497             j_coord_offsetC  = DIM*jnrC;
1498             j_coord_offsetD  = DIM*jnrD;
1499             j_coord_offsetE  = DIM*jnrE;
1500             j_coord_offsetF  = DIM*jnrF;
1501             j_coord_offsetG  = DIM*jnrG;
1502             j_coord_offsetH  = DIM*jnrH;
1503
1504             /* load j atom coordinates */
1505             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1506                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1507                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1508                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1509                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1510                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1511
1512             /* Calculate displacement vector */
1513             dx00             = _mm256_sub_ps(ix0,jx0);
1514             dy00             = _mm256_sub_ps(iy0,jy0);
1515             dz00             = _mm256_sub_ps(iz0,jz0);
1516             dx11             = _mm256_sub_ps(ix1,jx1);
1517             dy11             = _mm256_sub_ps(iy1,jy1);
1518             dz11             = _mm256_sub_ps(iz1,jz1);
1519             dx12             = _mm256_sub_ps(ix1,jx2);
1520             dy12             = _mm256_sub_ps(iy1,jy2);
1521             dz12             = _mm256_sub_ps(iz1,jz2);
1522             dx13             = _mm256_sub_ps(ix1,jx3);
1523             dy13             = _mm256_sub_ps(iy1,jy3);
1524             dz13             = _mm256_sub_ps(iz1,jz3);
1525             dx21             = _mm256_sub_ps(ix2,jx1);
1526             dy21             = _mm256_sub_ps(iy2,jy1);
1527             dz21             = _mm256_sub_ps(iz2,jz1);
1528             dx22             = _mm256_sub_ps(ix2,jx2);
1529             dy22             = _mm256_sub_ps(iy2,jy2);
1530             dz22             = _mm256_sub_ps(iz2,jz2);
1531             dx23             = _mm256_sub_ps(ix2,jx3);
1532             dy23             = _mm256_sub_ps(iy2,jy3);
1533             dz23             = _mm256_sub_ps(iz2,jz3);
1534             dx31             = _mm256_sub_ps(ix3,jx1);
1535             dy31             = _mm256_sub_ps(iy3,jy1);
1536             dz31             = _mm256_sub_ps(iz3,jz1);
1537             dx32             = _mm256_sub_ps(ix3,jx2);
1538             dy32             = _mm256_sub_ps(iy3,jy2);
1539             dz32             = _mm256_sub_ps(iz3,jz2);
1540             dx33             = _mm256_sub_ps(ix3,jx3);
1541             dy33             = _mm256_sub_ps(iy3,jy3);
1542             dz33             = _mm256_sub_ps(iz3,jz3);
1543
1544             /* Calculate squared distance and things based on it */
1545             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1546             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1547             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1548             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1549             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1550             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1551             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1552             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1553             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1554             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1555
1556             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1557             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1558             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1559             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1560             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1561             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1562             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1563             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1564             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1565
1566             rinvsq00         = gmx_mm256_inv_ps(rsq00);
1567             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1568             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1569             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1570             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1571             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1572             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1573             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1574             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1575             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1576
1577             fjx0             = _mm256_setzero_ps();
1578             fjy0             = _mm256_setzero_ps();
1579             fjz0             = _mm256_setzero_ps();
1580             fjx1             = _mm256_setzero_ps();
1581             fjy1             = _mm256_setzero_ps();
1582             fjz1             = _mm256_setzero_ps();
1583             fjx2             = _mm256_setzero_ps();
1584             fjy2             = _mm256_setzero_ps();
1585             fjz2             = _mm256_setzero_ps();
1586             fjx3             = _mm256_setzero_ps();
1587             fjy3             = _mm256_setzero_ps();
1588             fjz3             = _mm256_setzero_ps();
1589
1590             /**************************
1591              * CALCULATE INTERACTIONS *
1592              **************************/
1593
1594             /* LENNARD-JONES DISPERSION/REPULSION */
1595
1596             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1597             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1598
1599             fscal            = fvdw;
1600
1601             /* Calculate temporary vectorial force */
1602             tx               = _mm256_mul_ps(fscal,dx00);
1603             ty               = _mm256_mul_ps(fscal,dy00);
1604             tz               = _mm256_mul_ps(fscal,dz00);
1605
1606             /* Update vectorial force */
1607             fix0             = _mm256_add_ps(fix0,tx);
1608             fiy0             = _mm256_add_ps(fiy0,ty);
1609             fiz0             = _mm256_add_ps(fiz0,tz);
1610
1611             fjx0             = _mm256_add_ps(fjx0,tx);
1612             fjy0             = _mm256_add_ps(fjy0,ty);
1613             fjz0             = _mm256_add_ps(fjz0,tz);
1614
1615             /**************************
1616              * CALCULATE INTERACTIONS *
1617              **************************/
1618
1619             r11              = _mm256_mul_ps(rsq11,rinv11);
1620
1621             /* EWALD ELECTROSTATICS */
1622             
1623             /* Analytical PME correction */
1624             zeta2            = _mm256_mul_ps(beta2,rsq11);
1625             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1626             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1627             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1628             felec            = _mm256_mul_ps(qq11,felec);
1629             
1630             fscal            = felec;
1631
1632             /* Calculate temporary vectorial force */
1633             tx               = _mm256_mul_ps(fscal,dx11);
1634             ty               = _mm256_mul_ps(fscal,dy11);
1635             tz               = _mm256_mul_ps(fscal,dz11);
1636
1637             /* Update vectorial force */
1638             fix1             = _mm256_add_ps(fix1,tx);
1639             fiy1             = _mm256_add_ps(fiy1,ty);
1640             fiz1             = _mm256_add_ps(fiz1,tz);
1641
1642             fjx1             = _mm256_add_ps(fjx1,tx);
1643             fjy1             = _mm256_add_ps(fjy1,ty);
1644             fjz1             = _mm256_add_ps(fjz1,tz);
1645
1646             /**************************
1647              * CALCULATE INTERACTIONS *
1648              **************************/
1649
1650             r12              = _mm256_mul_ps(rsq12,rinv12);
1651
1652             /* EWALD ELECTROSTATICS */
1653             
1654             /* Analytical PME correction */
1655             zeta2            = _mm256_mul_ps(beta2,rsq12);
1656             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1657             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1658             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1659             felec            = _mm256_mul_ps(qq12,felec);
1660             
1661             fscal            = felec;
1662
1663             /* Calculate temporary vectorial force */
1664             tx               = _mm256_mul_ps(fscal,dx12);
1665             ty               = _mm256_mul_ps(fscal,dy12);
1666             tz               = _mm256_mul_ps(fscal,dz12);
1667
1668             /* Update vectorial force */
1669             fix1             = _mm256_add_ps(fix1,tx);
1670             fiy1             = _mm256_add_ps(fiy1,ty);
1671             fiz1             = _mm256_add_ps(fiz1,tz);
1672
1673             fjx2             = _mm256_add_ps(fjx2,tx);
1674             fjy2             = _mm256_add_ps(fjy2,ty);
1675             fjz2             = _mm256_add_ps(fjz2,tz);
1676
1677             /**************************
1678              * CALCULATE INTERACTIONS *
1679              **************************/
1680
1681             r13              = _mm256_mul_ps(rsq13,rinv13);
1682
1683             /* EWALD ELECTROSTATICS */
1684             
1685             /* Analytical PME correction */
1686             zeta2            = _mm256_mul_ps(beta2,rsq13);
1687             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
1688             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1689             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1690             felec            = _mm256_mul_ps(qq13,felec);
1691             
1692             fscal            = felec;
1693
1694             /* Calculate temporary vectorial force */
1695             tx               = _mm256_mul_ps(fscal,dx13);
1696             ty               = _mm256_mul_ps(fscal,dy13);
1697             tz               = _mm256_mul_ps(fscal,dz13);
1698
1699             /* Update vectorial force */
1700             fix1             = _mm256_add_ps(fix1,tx);
1701             fiy1             = _mm256_add_ps(fiy1,ty);
1702             fiz1             = _mm256_add_ps(fiz1,tz);
1703
1704             fjx3             = _mm256_add_ps(fjx3,tx);
1705             fjy3             = _mm256_add_ps(fjy3,ty);
1706             fjz3             = _mm256_add_ps(fjz3,tz);
1707
1708             /**************************
1709              * CALCULATE INTERACTIONS *
1710              **************************/
1711
1712             r21              = _mm256_mul_ps(rsq21,rinv21);
1713
1714             /* EWALD ELECTROSTATICS */
1715             
1716             /* Analytical PME correction */
1717             zeta2            = _mm256_mul_ps(beta2,rsq21);
1718             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1719             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1720             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1721             felec            = _mm256_mul_ps(qq21,felec);
1722             
1723             fscal            = felec;
1724
1725             /* Calculate temporary vectorial force */
1726             tx               = _mm256_mul_ps(fscal,dx21);
1727             ty               = _mm256_mul_ps(fscal,dy21);
1728             tz               = _mm256_mul_ps(fscal,dz21);
1729
1730             /* Update vectorial force */
1731             fix2             = _mm256_add_ps(fix2,tx);
1732             fiy2             = _mm256_add_ps(fiy2,ty);
1733             fiz2             = _mm256_add_ps(fiz2,tz);
1734
1735             fjx1             = _mm256_add_ps(fjx1,tx);
1736             fjy1             = _mm256_add_ps(fjy1,ty);
1737             fjz1             = _mm256_add_ps(fjz1,tz);
1738
1739             /**************************
1740              * CALCULATE INTERACTIONS *
1741              **************************/
1742
1743             r22              = _mm256_mul_ps(rsq22,rinv22);
1744
1745             /* EWALD ELECTROSTATICS */
1746             
1747             /* Analytical PME correction */
1748             zeta2            = _mm256_mul_ps(beta2,rsq22);
1749             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1750             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1751             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1752             felec            = _mm256_mul_ps(qq22,felec);
1753             
1754             fscal            = felec;
1755
1756             /* Calculate temporary vectorial force */
1757             tx               = _mm256_mul_ps(fscal,dx22);
1758             ty               = _mm256_mul_ps(fscal,dy22);
1759             tz               = _mm256_mul_ps(fscal,dz22);
1760
1761             /* Update vectorial force */
1762             fix2             = _mm256_add_ps(fix2,tx);
1763             fiy2             = _mm256_add_ps(fiy2,ty);
1764             fiz2             = _mm256_add_ps(fiz2,tz);
1765
1766             fjx2             = _mm256_add_ps(fjx2,tx);
1767             fjy2             = _mm256_add_ps(fjy2,ty);
1768             fjz2             = _mm256_add_ps(fjz2,tz);
1769
1770             /**************************
1771              * CALCULATE INTERACTIONS *
1772              **************************/
1773
1774             r23              = _mm256_mul_ps(rsq23,rinv23);
1775
1776             /* EWALD ELECTROSTATICS */
1777             
1778             /* Analytical PME correction */
1779             zeta2            = _mm256_mul_ps(beta2,rsq23);
1780             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
1781             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1782             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1783             felec            = _mm256_mul_ps(qq23,felec);
1784             
1785             fscal            = felec;
1786
1787             /* Calculate temporary vectorial force */
1788             tx               = _mm256_mul_ps(fscal,dx23);
1789             ty               = _mm256_mul_ps(fscal,dy23);
1790             tz               = _mm256_mul_ps(fscal,dz23);
1791
1792             /* Update vectorial force */
1793             fix2             = _mm256_add_ps(fix2,tx);
1794             fiy2             = _mm256_add_ps(fiy2,ty);
1795             fiz2             = _mm256_add_ps(fiz2,tz);
1796
1797             fjx3             = _mm256_add_ps(fjx3,tx);
1798             fjy3             = _mm256_add_ps(fjy3,ty);
1799             fjz3             = _mm256_add_ps(fjz3,tz);
1800
1801             /**************************
1802              * CALCULATE INTERACTIONS *
1803              **************************/
1804
1805             r31              = _mm256_mul_ps(rsq31,rinv31);
1806
1807             /* EWALD ELECTROSTATICS */
1808             
1809             /* Analytical PME correction */
1810             zeta2            = _mm256_mul_ps(beta2,rsq31);
1811             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
1812             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1813             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1814             felec            = _mm256_mul_ps(qq31,felec);
1815             
1816             fscal            = felec;
1817
1818             /* Calculate temporary vectorial force */
1819             tx               = _mm256_mul_ps(fscal,dx31);
1820             ty               = _mm256_mul_ps(fscal,dy31);
1821             tz               = _mm256_mul_ps(fscal,dz31);
1822
1823             /* Update vectorial force */
1824             fix3             = _mm256_add_ps(fix3,tx);
1825             fiy3             = _mm256_add_ps(fiy3,ty);
1826             fiz3             = _mm256_add_ps(fiz3,tz);
1827
1828             fjx1             = _mm256_add_ps(fjx1,tx);
1829             fjy1             = _mm256_add_ps(fjy1,ty);
1830             fjz1             = _mm256_add_ps(fjz1,tz);
1831
1832             /**************************
1833              * CALCULATE INTERACTIONS *
1834              **************************/
1835
1836             r32              = _mm256_mul_ps(rsq32,rinv32);
1837
1838             /* EWALD ELECTROSTATICS */
1839             
1840             /* Analytical PME correction */
1841             zeta2            = _mm256_mul_ps(beta2,rsq32);
1842             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
1843             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1844             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1845             felec            = _mm256_mul_ps(qq32,felec);
1846             
1847             fscal            = felec;
1848
1849             /* Calculate temporary vectorial force */
1850             tx               = _mm256_mul_ps(fscal,dx32);
1851             ty               = _mm256_mul_ps(fscal,dy32);
1852             tz               = _mm256_mul_ps(fscal,dz32);
1853
1854             /* Update vectorial force */
1855             fix3             = _mm256_add_ps(fix3,tx);
1856             fiy3             = _mm256_add_ps(fiy3,ty);
1857             fiz3             = _mm256_add_ps(fiz3,tz);
1858
1859             fjx2             = _mm256_add_ps(fjx2,tx);
1860             fjy2             = _mm256_add_ps(fjy2,ty);
1861             fjz2             = _mm256_add_ps(fjz2,tz);
1862
1863             /**************************
1864              * CALCULATE INTERACTIONS *
1865              **************************/
1866
1867             r33              = _mm256_mul_ps(rsq33,rinv33);
1868
1869             /* EWALD ELECTROSTATICS */
1870             
1871             /* Analytical PME correction */
1872             zeta2            = _mm256_mul_ps(beta2,rsq33);
1873             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
1874             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1875             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1876             felec            = _mm256_mul_ps(qq33,felec);
1877             
1878             fscal            = felec;
1879
1880             /* Calculate temporary vectorial force */
1881             tx               = _mm256_mul_ps(fscal,dx33);
1882             ty               = _mm256_mul_ps(fscal,dy33);
1883             tz               = _mm256_mul_ps(fscal,dz33);
1884
1885             /* Update vectorial force */
1886             fix3             = _mm256_add_ps(fix3,tx);
1887             fiy3             = _mm256_add_ps(fiy3,ty);
1888             fiz3             = _mm256_add_ps(fiz3,tz);
1889
1890             fjx3             = _mm256_add_ps(fjx3,tx);
1891             fjy3             = _mm256_add_ps(fjy3,ty);
1892             fjz3             = _mm256_add_ps(fjz3,tz);
1893
1894             fjptrA             = f+j_coord_offsetA;
1895             fjptrB             = f+j_coord_offsetB;
1896             fjptrC             = f+j_coord_offsetC;
1897             fjptrD             = f+j_coord_offsetD;
1898             fjptrE             = f+j_coord_offsetE;
1899             fjptrF             = f+j_coord_offsetF;
1900             fjptrG             = f+j_coord_offsetG;
1901             fjptrH             = f+j_coord_offsetH;
1902
1903             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1904                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1905                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1906
1907             /* Inner loop uses 534 flops */
1908         }
1909
1910         if(jidx<j_index_end)
1911         {
1912
1913             /* Get j neighbor index, and coordinate index */
1914             jnrlistA         = jjnr[jidx];
1915             jnrlistB         = jjnr[jidx+1];
1916             jnrlistC         = jjnr[jidx+2];
1917             jnrlistD         = jjnr[jidx+3];
1918             jnrlistE         = jjnr[jidx+4];
1919             jnrlistF         = jjnr[jidx+5];
1920             jnrlistG         = jjnr[jidx+6];
1921             jnrlistH         = jjnr[jidx+7];
1922             /* Sign of each element will be negative for non-real atoms.
1923              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1924              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1925              */
1926             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1927                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1928                                             
1929             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1930             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1931             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1932             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1933             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1934             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1935             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1936             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1937             j_coord_offsetA  = DIM*jnrA;
1938             j_coord_offsetB  = DIM*jnrB;
1939             j_coord_offsetC  = DIM*jnrC;
1940             j_coord_offsetD  = DIM*jnrD;
1941             j_coord_offsetE  = DIM*jnrE;
1942             j_coord_offsetF  = DIM*jnrF;
1943             j_coord_offsetG  = DIM*jnrG;
1944             j_coord_offsetH  = DIM*jnrH;
1945
1946             /* load j atom coordinates */
1947             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1948                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1949                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1950                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1951                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1952                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1953
1954             /* Calculate displacement vector */
1955             dx00             = _mm256_sub_ps(ix0,jx0);
1956             dy00             = _mm256_sub_ps(iy0,jy0);
1957             dz00             = _mm256_sub_ps(iz0,jz0);
1958             dx11             = _mm256_sub_ps(ix1,jx1);
1959             dy11             = _mm256_sub_ps(iy1,jy1);
1960             dz11             = _mm256_sub_ps(iz1,jz1);
1961             dx12             = _mm256_sub_ps(ix1,jx2);
1962             dy12             = _mm256_sub_ps(iy1,jy2);
1963             dz12             = _mm256_sub_ps(iz1,jz2);
1964             dx13             = _mm256_sub_ps(ix1,jx3);
1965             dy13             = _mm256_sub_ps(iy1,jy3);
1966             dz13             = _mm256_sub_ps(iz1,jz3);
1967             dx21             = _mm256_sub_ps(ix2,jx1);
1968             dy21             = _mm256_sub_ps(iy2,jy1);
1969             dz21             = _mm256_sub_ps(iz2,jz1);
1970             dx22             = _mm256_sub_ps(ix2,jx2);
1971             dy22             = _mm256_sub_ps(iy2,jy2);
1972             dz22             = _mm256_sub_ps(iz2,jz2);
1973             dx23             = _mm256_sub_ps(ix2,jx3);
1974             dy23             = _mm256_sub_ps(iy2,jy3);
1975             dz23             = _mm256_sub_ps(iz2,jz3);
1976             dx31             = _mm256_sub_ps(ix3,jx1);
1977             dy31             = _mm256_sub_ps(iy3,jy1);
1978             dz31             = _mm256_sub_ps(iz3,jz1);
1979             dx32             = _mm256_sub_ps(ix3,jx2);
1980             dy32             = _mm256_sub_ps(iy3,jy2);
1981             dz32             = _mm256_sub_ps(iz3,jz2);
1982             dx33             = _mm256_sub_ps(ix3,jx3);
1983             dy33             = _mm256_sub_ps(iy3,jy3);
1984             dz33             = _mm256_sub_ps(iz3,jz3);
1985
1986             /* Calculate squared distance and things based on it */
1987             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1988             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1989             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1990             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1991             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1992             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1993             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1994             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1995             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1996             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1997
1998             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1999             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2000             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
2001             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2002             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2003             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
2004             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
2005             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
2006             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
2007
2008             rinvsq00         = gmx_mm256_inv_ps(rsq00);
2009             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
2010             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
2011             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
2012             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
2013             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
2014             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
2015             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
2016             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
2017             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
2018
2019             fjx0             = _mm256_setzero_ps();
2020             fjy0             = _mm256_setzero_ps();
2021             fjz0             = _mm256_setzero_ps();
2022             fjx1             = _mm256_setzero_ps();
2023             fjy1             = _mm256_setzero_ps();
2024             fjz1             = _mm256_setzero_ps();
2025             fjx2             = _mm256_setzero_ps();
2026             fjy2             = _mm256_setzero_ps();
2027             fjz2             = _mm256_setzero_ps();
2028             fjx3             = _mm256_setzero_ps();
2029             fjy3             = _mm256_setzero_ps();
2030             fjz3             = _mm256_setzero_ps();
2031
2032             /**************************
2033              * CALCULATE INTERACTIONS *
2034              **************************/
2035
2036             /* LENNARD-JONES DISPERSION/REPULSION */
2037
2038             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2039             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2040
2041             fscal            = fvdw;
2042
2043             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2044
2045             /* Calculate temporary vectorial force */
2046             tx               = _mm256_mul_ps(fscal,dx00);
2047             ty               = _mm256_mul_ps(fscal,dy00);
2048             tz               = _mm256_mul_ps(fscal,dz00);
2049
2050             /* Update vectorial force */
2051             fix0             = _mm256_add_ps(fix0,tx);
2052             fiy0             = _mm256_add_ps(fiy0,ty);
2053             fiz0             = _mm256_add_ps(fiz0,tz);
2054
2055             fjx0             = _mm256_add_ps(fjx0,tx);
2056             fjy0             = _mm256_add_ps(fjy0,ty);
2057             fjz0             = _mm256_add_ps(fjz0,tz);
2058
2059             /**************************
2060              * CALCULATE INTERACTIONS *
2061              **************************/
2062
2063             r11              = _mm256_mul_ps(rsq11,rinv11);
2064             r11              = _mm256_andnot_ps(dummy_mask,r11);
2065
2066             /* EWALD ELECTROSTATICS */
2067             
2068             /* Analytical PME correction */
2069             zeta2            = _mm256_mul_ps(beta2,rsq11);
2070             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2071             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2072             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2073             felec            = _mm256_mul_ps(qq11,felec);
2074             
2075             fscal            = felec;
2076
2077             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2078
2079             /* Calculate temporary vectorial force */
2080             tx               = _mm256_mul_ps(fscal,dx11);
2081             ty               = _mm256_mul_ps(fscal,dy11);
2082             tz               = _mm256_mul_ps(fscal,dz11);
2083
2084             /* Update vectorial force */
2085             fix1             = _mm256_add_ps(fix1,tx);
2086             fiy1             = _mm256_add_ps(fiy1,ty);
2087             fiz1             = _mm256_add_ps(fiz1,tz);
2088
2089             fjx1             = _mm256_add_ps(fjx1,tx);
2090             fjy1             = _mm256_add_ps(fjy1,ty);
2091             fjz1             = _mm256_add_ps(fjz1,tz);
2092
2093             /**************************
2094              * CALCULATE INTERACTIONS *
2095              **************************/
2096
2097             r12              = _mm256_mul_ps(rsq12,rinv12);
2098             r12              = _mm256_andnot_ps(dummy_mask,r12);
2099
2100             /* EWALD ELECTROSTATICS */
2101             
2102             /* Analytical PME correction */
2103             zeta2            = _mm256_mul_ps(beta2,rsq12);
2104             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2105             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2106             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2107             felec            = _mm256_mul_ps(qq12,felec);
2108             
2109             fscal            = felec;
2110
2111             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2112
2113             /* Calculate temporary vectorial force */
2114             tx               = _mm256_mul_ps(fscal,dx12);
2115             ty               = _mm256_mul_ps(fscal,dy12);
2116             tz               = _mm256_mul_ps(fscal,dz12);
2117
2118             /* Update vectorial force */
2119             fix1             = _mm256_add_ps(fix1,tx);
2120             fiy1             = _mm256_add_ps(fiy1,ty);
2121             fiz1             = _mm256_add_ps(fiz1,tz);
2122
2123             fjx2             = _mm256_add_ps(fjx2,tx);
2124             fjy2             = _mm256_add_ps(fjy2,ty);
2125             fjz2             = _mm256_add_ps(fjz2,tz);
2126
2127             /**************************
2128              * CALCULATE INTERACTIONS *
2129              **************************/
2130
2131             r13              = _mm256_mul_ps(rsq13,rinv13);
2132             r13              = _mm256_andnot_ps(dummy_mask,r13);
2133
2134             /* EWALD ELECTROSTATICS */
2135             
2136             /* Analytical PME correction */
2137             zeta2            = _mm256_mul_ps(beta2,rsq13);
2138             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
2139             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2140             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2141             felec            = _mm256_mul_ps(qq13,felec);
2142             
2143             fscal            = felec;
2144
2145             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2146
2147             /* Calculate temporary vectorial force */
2148             tx               = _mm256_mul_ps(fscal,dx13);
2149             ty               = _mm256_mul_ps(fscal,dy13);
2150             tz               = _mm256_mul_ps(fscal,dz13);
2151
2152             /* Update vectorial force */
2153             fix1             = _mm256_add_ps(fix1,tx);
2154             fiy1             = _mm256_add_ps(fiy1,ty);
2155             fiz1             = _mm256_add_ps(fiz1,tz);
2156
2157             fjx3             = _mm256_add_ps(fjx3,tx);
2158             fjy3             = _mm256_add_ps(fjy3,ty);
2159             fjz3             = _mm256_add_ps(fjz3,tz);
2160
2161             /**************************
2162              * CALCULATE INTERACTIONS *
2163              **************************/
2164
2165             r21              = _mm256_mul_ps(rsq21,rinv21);
2166             r21              = _mm256_andnot_ps(dummy_mask,r21);
2167
2168             /* EWALD ELECTROSTATICS */
2169             
2170             /* Analytical PME correction */
2171             zeta2            = _mm256_mul_ps(beta2,rsq21);
2172             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2173             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2174             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2175             felec            = _mm256_mul_ps(qq21,felec);
2176             
2177             fscal            = felec;
2178
2179             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2180
2181             /* Calculate temporary vectorial force */
2182             tx               = _mm256_mul_ps(fscal,dx21);
2183             ty               = _mm256_mul_ps(fscal,dy21);
2184             tz               = _mm256_mul_ps(fscal,dz21);
2185
2186             /* Update vectorial force */
2187             fix2             = _mm256_add_ps(fix2,tx);
2188             fiy2             = _mm256_add_ps(fiy2,ty);
2189             fiz2             = _mm256_add_ps(fiz2,tz);
2190
2191             fjx1             = _mm256_add_ps(fjx1,tx);
2192             fjy1             = _mm256_add_ps(fjy1,ty);
2193             fjz1             = _mm256_add_ps(fjz1,tz);
2194
2195             /**************************
2196              * CALCULATE INTERACTIONS *
2197              **************************/
2198
2199             r22              = _mm256_mul_ps(rsq22,rinv22);
2200             r22              = _mm256_andnot_ps(dummy_mask,r22);
2201
2202             /* EWALD ELECTROSTATICS */
2203             
2204             /* Analytical PME correction */
2205             zeta2            = _mm256_mul_ps(beta2,rsq22);
2206             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2207             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2208             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2209             felec            = _mm256_mul_ps(qq22,felec);
2210             
2211             fscal            = felec;
2212
2213             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2214
2215             /* Calculate temporary vectorial force */
2216             tx               = _mm256_mul_ps(fscal,dx22);
2217             ty               = _mm256_mul_ps(fscal,dy22);
2218             tz               = _mm256_mul_ps(fscal,dz22);
2219
2220             /* Update vectorial force */
2221             fix2             = _mm256_add_ps(fix2,tx);
2222             fiy2             = _mm256_add_ps(fiy2,ty);
2223             fiz2             = _mm256_add_ps(fiz2,tz);
2224
2225             fjx2             = _mm256_add_ps(fjx2,tx);
2226             fjy2             = _mm256_add_ps(fjy2,ty);
2227             fjz2             = _mm256_add_ps(fjz2,tz);
2228
2229             /**************************
2230              * CALCULATE INTERACTIONS *
2231              **************************/
2232
2233             r23              = _mm256_mul_ps(rsq23,rinv23);
2234             r23              = _mm256_andnot_ps(dummy_mask,r23);
2235
2236             /* EWALD ELECTROSTATICS */
2237             
2238             /* Analytical PME correction */
2239             zeta2            = _mm256_mul_ps(beta2,rsq23);
2240             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
2241             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2242             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2243             felec            = _mm256_mul_ps(qq23,felec);
2244             
2245             fscal            = felec;
2246
2247             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2248
2249             /* Calculate temporary vectorial force */
2250             tx               = _mm256_mul_ps(fscal,dx23);
2251             ty               = _mm256_mul_ps(fscal,dy23);
2252             tz               = _mm256_mul_ps(fscal,dz23);
2253
2254             /* Update vectorial force */
2255             fix2             = _mm256_add_ps(fix2,tx);
2256             fiy2             = _mm256_add_ps(fiy2,ty);
2257             fiz2             = _mm256_add_ps(fiz2,tz);
2258
2259             fjx3             = _mm256_add_ps(fjx3,tx);
2260             fjy3             = _mm256_add_ps(fjy3,ty);
2261             fjz3             = _mm256_add_ps(fjz3,tz);
2262
2263             /**************************
2264              * CALCULATE INTERACTIONS *
2265              **************************/
2266
2267             r31              = _mm256_mul_ps(rsq31,rinv31);
2268             r31              = _mm256_andnot_ps(dummy_mask,r31);
2269
2270             /* EWALD ELECTROSTATICS */
2271             
2272             /* Analytical PME correction */
2273             zeta2            = _mm256_mul_ps(beta2,rsq31);
2274             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
2275             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2276             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2277             felec            = _mm256_mul_ps(qq31,felec);
2278             
2279             fscal            = felec;
2280
2281             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2282
2283             /* Calculate temporary vectorial force */
2284             tx               = _mm256_mul_ps(fscal,dx31);
2285             ty               = _mm256_mul_ps(fscal,dy31);
2286             tz               = _mm256_mul_ps(fscal,dz31);
2287
2288             /* Update vectorial force */
2289             fix3             = _mm256_add_ps(fix3,tx);
2290             fiy3             = _mm256_add_ps(fiy3,ty);
2291             fiz3             = _mm256_add_ps(fiz3,tz);
2292
2293             fjx1             = _mm256_add_ps(fjx1,tx);
2294             fjy1             = _mm256_add_ps(fjy1,ty);
2295             fjz1             = _mm256_add_ps(fjz1,tz);
2296
2297             /**************************
2298              * CALCULATE INTERACTIONS *
2299              **************************/
2300
2301             r32              = _mm256_mul_ps(rsq32,rinv32);
2302             r32              = _mm256_andnot_ps(dummy_mask,r32);
2303
2304             /* EWALD ELECTROSTATICS */
2305             
2306             /* Analytical PME correction */
2307             zeta2            = _mm256_mul_ps(beta2,rsq32);
2308             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
2309             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2310             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2311             felec            = _mm256_mul_ps(qq32,felec);
2312             
2313             fscal            = felec;
2314
2315             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2316
2317             /* Calculate temporary vectorial force */
2318             tx               = _mm256_mul_ps(fscal,dx32);
2319             ty               = _mm256_mul_ps(fscal,dy32);
2320             tz               = _mm256_mul_ps(fscal,dz32);
2321
2322             /* Update vectorial force */
2323             fix3             = _mm256_add_ps(fix3,tx);
2324             fiy3             = _mm256_add_ps(fiy3,ty);
2325             fiz3             = _mm256_add_ps(fiz3,tz);
2326
2327             fjx2             = _mm256_add_ps(fjx2,tx);
2328             fjy2             = _mm256_add_ps(fjy2,ty);
2329             fjz2             = _mm256_add_ps(fjz2,tz);
2330
2331             /**************************
2332              * CALCULATE INTERACTIONS *
2333              **************************/
2334
2335             r33              = _mm256_mul_ps(rsq33,rinv33);
2336             r33              = _mm256_andnot_ps(dummy_mask,r33);
2337
2338             /* EWALD ELECTROSTATICS */
2339             
2340             /* Analytical PME correction */
2341             zeta2            = _mm256_mul_ps(beta2,rsq33);
2342             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
2343             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2344             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2345             felec            = _mm256_mul_ps(qq33,felec);
2346             
2347             fscal            = felec;
2348
2349             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2350
2351             /* Calculate temporary vectorial force */
2352             tx               = _mm256_mul_ps(fscal,dx33);
2353             ty               = _mm256_mul_ps(fscal,dy33);
2354             tz               = _mm256_mul_ps(fscal,dz33);
2355
2356             /* Update vectorial force */
2357             fix3             = _mm256_add_ps(fix3,tx);
2358             fiy3             = _mm256_add_ps(fiy3,ty);
2359             fiz3             = _mm256_add_ps(fiz3,tz);
2360
2361             fjx3             = _mm256_add_ps(fjx3,tx);
2362             fjy3             = _mm256_add_ps(fjy3,ty);
2363             fjz3             = _mm256_add_ps(fjz3,tz);
2364
2365             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2366             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2367             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2368             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2369             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2370             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2371             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2372             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2373
2374             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2375                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2376                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2377
2378             /* Inner loop uses 543 flops */
2379         }
2380
2381         /* End of innermost loop */
2382
2383         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2384                                                  f+i_coord_offset,fshift+i_shift_offset);
2385
2386         /* Increment number of inner iterations */
2387         inneriter                  += j_index_end - j_index_start;
2388
2389         /* Outer loop uses 24 flops */
2390     }
2391
2392     /* Increment number of outer iterations */
2393     outeriter        += nri;
2394
2395     /* Update outer/inner flops */
2396
2397     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*543);
2398 }