made errors during GPU detection non-fatal
[alexxy/gromacs.git] / src / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_avx_256_single.c
1 /*
2  * Note: this file was generated by the Gromacs avx_256_single kernel generator.
3  *
4  *                This source code is part of
5  *
6  *                 G   R   O   M   A   C   S
7  *
8  * Copyright (c) 2001-2012, The GROMACS Development Team
9  *
10  * Gromacs is a library for molecular simulation and trajectory analysis,
11  * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12  * a full list of developers and information, check out http://www.gromacs.org
13  *
14  * This program is free software; you can redistribute it and/or modify it under
15  * the terms of the GNU Lesser General Public License as published by the Free
16  * Software Foundation; either version 2 of the License, or (at your option) any
17  * later version.
18  *
19  * To help fund GROMACS development, we humbly ask that you cite
20  * the papers people have written on it - you can find them on the website.
21  */
22 #ifdef HAVE_CONFIG_H
23 #include <config.h>
24 #endif
25
26 #include <math.h>
27
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
30 #include "vec.h"
31 #include "nrnb.h"
32
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
35
36 /*
37  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_single
38  * Electrostatics interaction: Ewald
39  * VdW interaction:            LennardJones
40  * Geometry:                   Water3-Water3
41  * Calculate force/pot:        PotentialAndForce
42  */
43 void
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_single
45                     (t_nblist * gmx_restrict                nlist,
46                      rvec * gmx_restrict                    xx,
47                      rvec * gmx_restrict                    ff,
48                      t_forcerec * gmx_restrict              fr,
49                      t_mdatoms * gmx_restrict               mdatoms,
50                      nb_kernel_data_t * gmx_restrict        kernel_data,
51                      t_nrnb * gmx_restrict                  nrnb)
52 {
53     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
54      * just 0 for non-waters.
55      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56      * jnr indices corresponding to data put in the eight positions in the SIMD register.
57      */
58     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
59     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60     int              jnrA,jnrB,jnrC,jnrD;
61     int              jnrE,jnrF,jnrG,jnrH;
62     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
67     real             rcutoff_scalar;
68     real             *shiftvec,*fshift,*x,*f;
69     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
70     real             scratch[4*DIM];
71     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72     real *           vdwioffsetptr0;
73     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74     real *           vdwioffsetptr1;
75     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76     real *           vdwioffsetptr2;
77     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
94     real             *charge;
95     int              nvdwtype;
96     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97     int              *vdwtype;
98     real             *vdwparam;
99     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
100     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
101     __m256i          ewitab;
102     __m128i          ewitab_lo,ewitab_hi;
103     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
104     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
105     real             *ewtab;
106     __m256           dummy_mask,cutoff_mask;
107     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
108     __m256           one     = _mm256_set1_ps(1.0);
109     __m256           two     = _mm256_set1_ps(2.0);
110     x                = xx[0];
111     f                = ff[0];
112
113     nri              = nlist->nri;
114     iinr             = nlist->iinr;
115     jindex           = nlist->jindex;
116     jjnr             = nlist->jjnr;
117     shiftidx         = nlist->shift;
118     gid              = nlist->gid;
119     shiftvec         = fr->shift_vec[0];
120     fshift           = fr->fshift[0];
121     facel            = _mm256_set1_ps(fr->epsfac);
122     charge           = mdatoms->chargeA;
123     nvdwtype         = fr->ntype;
124     vdwparam         = fr->nbfp;
125     vdwtype          = mdatoms->typeA;
126
127     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
128     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
129     beta2            = _mm256_mul_ps(beta,beta);
130     beta3            = _mm256_mul_ps(beta,beta2);
131
132     ewtab            = fr->ic->tabq_coul_FDV0;
133     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
134     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
135
136     /* Setup water-specific parameters */
137     inr              = nlist->iinr[0];
138     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
139     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
140     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
141     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
142
143     jq0              = _mm256_set1_ps(charge[inr+0]);
144     jq1              = _mm256_set1_ps(charge[inr+1]);
145     jq2              = _mm256_set1_ps(charge[inr+2]);
146     vdwjidx0A        = 2*vdwtype[inr+0];
147     qq00             = _mm256_mul_ps(iq0,jq0);
148     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
149     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
150     qq01             = _mm256_mul_ps(iq0,jq1);
151     qq02             = _mm256_mul_ps(iq0,jq2);
152     qq10             = _mm256_mul_ps(iq1,jq0);
153     qq11             = _mm256_mul_ps(iq1,jq1);
154     qq12             = _mm256_mul_ps(iq1,jq2);
155     qq20             = _mm256_mul_ps(iq2,jq0);
156     qq21             = _mm256_mul_ps(iq2,jq1);
157     qq22             = _mm256_mul_ps(iq2,jq2);
158
159     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
160     rcutoff_scalar   = fr->rcoulomb;
161     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
162     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
163
164     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
165     rvdw             = _mm256_set1_ps(fr->rvdw);
166
167     /* Avoid stupid compiler warnings */
168     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
169     j_coord_offsetA = 0;
170     j_coord_offsetB = 0;
171     j_coord_offsetC = 0;
172     j_coord_offsetD = 0;
173     j_coord_offsetE = 0;
174     j_coord_offsetF = 0;
175     j_coord_offsetG = 0;
176     j_coord_offsetH = 0;
177
178     outeriter        = 0;
179     inneriter        = 0;
180
181     for(iidx=0;iidx<4*DIM;iidx++)
182     {
183         scratch[iidx] = 0.0;
184     }
185
186     /* Start outer loop over neighborlists */
187     for(iidx=0; iidx<nri; iidx++)
188     {
189         /* Load shift vector for this list */
190         i_shift_offset   = DIM*shiftidx[iidx];
191
192         /* Load limits for loop over neighbors */
193         j_index_start    = jindex[iidx];
194         j_index_end      = jindex[iidx+1];
195
196         /* Get outer coordinate index */
197         inr              = iinr[iidx];
198         i_coord_offset   = DIM*inr;
199
200         /* Load i particle coords and add shift vector */
201         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
202                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
203
204         fix0             = _mm256_setzero_ps();
205         fiy0             = _mm256_setzero_ps();
206         fiz0             = _mm256_setzero_ps();
207         fix1             = _mm256_setzero_ps();
208         fiy1             = _mm256_setzero_ps();
209         fiz1             = _mm256_setzero_ps();
210         fix2             = _mm256_setzero_ps();
211         fiy2             = _mm256_setzero_ps();
212         fiz2             = _mm256_setzero_ps();
213
214         /* Reset potential sums */
215         velecsum         = _mm256_setzero_ps();
216         vvdwsum          = _mm256_setzero_ps();
217
218         /* Start inner kernel loop */
219         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
220         {
221
222             /* Get j neighbor index, and coordinate index */
223             jnrA             = jjnr[jidx];
224             jnrB             = jjnr[jidx+1];
225             jnrC             = jjnr[jidx+2];
226             jnrD             = jjnr[jidx+3];
227             jnrE             = jjnr[jidx+4];
228             jnrF             = jjnr[jidx+5];
229             jnrG             = jjnr[jidx+6];
230             jnrH             = jjnr[jidx+7];
231             j_coord_offsetA  = DIM*jnrA;
232             j_coord_offsetB  = DIM*jnrB;
233             j_coord_offsetC  = DIM*jnrC;
234             j_coord_offsetD  = DIM*jnrD;
235             j_coord_offsetE  = DIM*jnrE;
236             j_coord_offsetF  = DIM*jnrF;
237             j_coord_offsetG  = DIM*jnrG;
238             j_coord_offsetH  = DIM*jnrH;
239
240             /* load j atom coordinates */
241             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
242                                                  x+j_coord_offsetC,x+j_coord_offsetD,
243                                                  x+j_coord_offsetE,x+j_coord_offsetF,
244                                                  x+j_coord_offsetG,x+j_coord_offsetH,
245                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
246
247             /* Calculate displacement vector */
248             dx00             = _mm256_sub_ps(ix0,jx0);
249             dy00             = _mm256_sub_ps(iy0,jy0);
250             dz00             = _mm256_sub_ps(iz0,jz0);
251             dx01             = _mm256_sub_ps(ix0,jx1);
252             dy01             = _mm256_sub_ps(iy0,jy1);
253             dz01             = _mm256_sub_ps(iz0,jz1);
254             dx02             = _mm256_sub_ps(ix0,jx2);
255             dy02             = _mm256_sub_ps(iy0,jy2);
256             dz02             = _mm256_sub_ps(iz0,jz2);
257             dx10             = _mm256_sub_ps(ix1,jx0);
258             dy10             = _mm256_sub_ps(iy1,jy0);
259             dz10             = _mm256_sub_ps(iz1,jz0);
260             dx11             = _mm256_sub_ps(ix1,jx1);
261             dy11             = _mm256_sub_ps(iy1,jy1);
262             dz11             = _mm256_sub_ps(iz1,jz1);
263             dx12             = _mm256_sub_ps(ix1,jx2);
264             dy12             = _mm256_sub_ps(iy1,jy2);
265             dz12             = _mm256_sub_ps(iz1,jz2);
266             dx20             = _mm256_sub_ps(ix2,jx0);
267             dy20             = _mm256_sub_ps(iy2,jy0);
268             dz20             = _mm256_sub_ps(iz2,jz0);
269             dx21             = _mm256_sub_ps(ix2,jx1);
270             dy21             = _mm256_sub_ps(iy2,jy1);
271             dz21             = _mm256_sub_ps(iz2,jz1);
272             dx22             = _mm256_sub_ps(ix2,jx2);
273             dy22             = _mm256_sub_ps(iy2,jy2);
274             dz22             = _mm256_sub_ps(iz2,jz2);
275
276             /* Calculate squared distance and things based on it */
277             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
278             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
279             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
280             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
281             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
282             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
283             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
284             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
285             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
286
287             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
288             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
289             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
290             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
291             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
292             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
293             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
294             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
295             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
296
297             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
298             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
299             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
300             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
301             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
302             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
303             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
304             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
305             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
306
307             fjx0             = _mm256_setzero_ps();
308             fjy0             = _mm256_setzero_ps();
309             fjz0             = _mm256_setzero_ps();
310             fjx1             = _mm256_setzero_ps();
311             fjy1             = _mm256_setzero_ps();
312             fjz1             = _mm256_setzero_ps();
313             fjx2             = _mm256_setzero_ps();
314             fjy2             = _mm256_setzero_ps();
315             fjz2             = _mm256_setzero_ps();
316
317             /**************************
318              * CALCULATE INTERACTIONS *
319              **************************/
320
321             if (gmx_mm256_any_lt(rsq00,rcutoff2))
322             {
323
324             r00              = _mm256_mul_ps(rsq00,rinv00);
325
326             /* EWALD ELECTROSTATICS */
327             
328             /* Analytical PME correction */
329             zeta2            = _mm256_mul_ps(beta2,rsq00);
330             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
331             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
332             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
333             felec            = _mm256_mul_ps(qq00,felec);
334             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
335             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
336             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
337             velec            = _mm256_mul_ps(qq00,velec);
338             
339             /* LENNARD-JONES DISPERSION/REPULSION */
340
341             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
342             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
343             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
344             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
345                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
346             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
347
348             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
349
350             /* Update potential sum for this i atom from the interaction with this j atom. */
351             velec            = _mm256_and_ps(velec,cutoff_mask);
352             velecsum         = _mm256_add_ps(velecsum,velec);
353             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
354             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
355
356             fscal            = _mm256_add_ps(felec,fvdw);
357
358             fscal            = _mm256_and_ps(fscal,cutoff_mask);
359
360             /* Calculate temporary vectorial force */
361             tx               = _mm256_mul_ps(fscal,dx00);
362             ty               = _mm256_mul_ps(fscal,dy00);
363             tz               = _mm256_mul_ps(fscal,dz00);
364
365             /* Update vectorial force */
366             fix0             = _mm256_add_ps(fix0,tx);
367             fiy0             = _mm256_add_ps(fiy0,ty);
368             fiz0             = _mm256_add_ps(fiz0,tz);
369
370             fjx0             = _mm256_add_ps(fjx0,tx);
371             fjy0             = _mm256_add_ps(fjy0,ty);
372             fjz0             = _mm256_add_ps(fjz0,tz);
373
374             }
375
376             /**************************
377              * CALCULATE INTERACTIONS *
378              **************************/
379
380             if (gmx_mm256_any_lt(rsq01,rcutoff2))
381             {
382
383             r01              = _mm256_mul_ps(rsq01,rinv01);
384
385             /* EWALD ELECTROSTATICS */
386             
387             /* Analytical PME correction */
388             zeta2            = _mm256_mul_ps(beta2,rsq01);
389             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
390             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
391             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
392             felec            = _mm256_mul_ps(qq01,felec);
393             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
394             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
395             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv01,sh_ewald),pmecorrV);
396             velec            = _mm256_mul_ps(qq01,velec);
397             
398             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
399
400             /* Update potential sum for this i atom from the interaction with this j atom. */
401             velec            = _mm256_and_ps(velec,cutoff_mask);
402             velecsum         = _mm256_add_ps(velecsum,velec);
403
404             fscal            = felec;
405
406             fscal            = _mm256_and_ps(fscal,cutoff_mask);
407
408             /* Calculate temporary vectorial force */
409             tx               = _mm256_mul_ps(fscal,dx01);
410             ty               = _mm256_mul_ps(fscal,dy01);
411             tz               = _mm256_mul_ps(fscal,dz01);
412
413             /* Update vectorial force */
414             fix0             = _mm256_add_ps(fix0,tx);
415             fiy0             = _mm256_add_ps(fiy0,ty);
416             fiz0             = _mm256_add_ps(fiz0,tz);
417
418             fjx1             = _mm256_add_ps(fjx1,tx);
419             fjy1             = _mm256_add_ps(fjy1,ty);
420             fjz1             = _mm256_add_ps(fjz1,tz);
421
422             }
423
424             /**************************
425              * CALCULATE INTERACTIONS *
426              **************************/
427
428             if (gmx_mm256_any_lt(rsq02,rcutoff2))
429             {
430
431             r02              = _mm256_mul_ps(rsq02,rinv02);
432
433             /* EWALD ELECTROSTATICS */
434             
435             /* Analytical PME correction */
436             zeta2            = _mm256_mul_ps(beta2,rsq02);
437             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
438             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
439             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
440             felec            = _mm256_mul_ps(qq02,felec);
441             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
442             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
443             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv02,sh_ewald),pmecorrV);
444             velec            = _mm256_mul_ps(qq02,velec);
445             
446             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
447
448             /* Update potential sum for this i atom from the interaction with this j atom. */
449             velec            = _mm256_and_ps(velec,cutoff_mask);
450             velecsum         = _mm256_add_ps(velecsum,velec);
451
452             fscal            = felec;
453
454             fscal            = _mm256_and_ps(fscal,cutoff_mask);
455
456             /* Calculate temporary vectorial force */
457             tx               = _mm256_mul_ps(fscal,dx02);
458             ty               = _mm256_mul_ps(fscal,dy02);
459             tz               = _mm256_mul_ps(fscal,dz02);
460
461             /* Update vectorial force */
462             fix0             = _mm256_add_ps(fix0,tx);
463             fiy0             = _mm256_add_ps(fiy0,ty);
464             fiz0             = _mm256_add_ps(fiz0,tz);
465
466             fjx2             = _mm256_add_ps(fjx2,tx);
467             fjy2             = _mm256_add_ps(fjy2,ty);
468             fjz2             = _mm256_add_ps(fjz2,tz);
469
470             }
471
472             /**************************
473              * CALCULATE INTERACTIONS *
474              **************************/
475
476             if (gmx_mm256_any_lt(rsq10,rcutoff2))
477             {
478
479             r10              = _mm256_mul_ps(rsq10,rinv10);
480
481             /* EWALD ELECTROSTATICS */
482             
483             /* Analytical PME correction */
484             zeta2            = _mm256_mul_ps(beta2,rsq10);
485             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
486             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
487             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
488             felec            = _mm256_mul_ps(qq10,felec);
489             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
490             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
491             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
492             velec            = _mm256_mul_ps(qq10,velec);
493             
494             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
495
496             /* Update potential sum for this i atom from the interaction with this j atom. */
497             velec            = _mm256_and_ps(velec,cutoff_mask);
498             velecsum         = _mm256_add_ps(velecsum,velec);
499
500             fscal            = felec;
501
502             fscal            = _mm256_and_ps(fscal,cutoff_mask);
503
504             /* Calculate temporary vectorial force */
505             tx               = _mm256_mul_ps(fscal,dx10);
506             ty               = _mm256_mul_ps(fscal,dy10);
507             tz               = _mm256_mul_ps(fscal,dz10);
508
509             /* Update vectorial force */
510             fix1             = _mm256_add_ps(fix1,tx);
511             fiy1             = _mm256_add_ps(fiy1,ty);
512             fiz1             = _mm256_add_ps(fiz1,tz);
513
514             fjx0             = _mm256_add_ps(fjx0,tx);
515             fjy0             = _mm256_add_ps(fjy0,ty);
516             fjz0             = _mm256_add_ps(fjz0,tz);
517
518             }
519
520             /**************************
521              * CALCULATE INTERACTIONS *
522              **************************/
523
524             if (gmx_mm256_any_lt(rsq11,rcutoff2))
525             {
526
527             r11              = _mm256_mul_ps(rsq11,rinv11);
528
529             /* EWALD ELECTROSTATICS */
530             
531             /* Analytical PME correction */
532             zeta2            = _mm256_mul_ps(beta2,rsq11);
533             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
534             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
535             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
536             felec            = _mm256_mul_ps(qq11,felec);
537             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
538             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
539             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
540             velec            = _mm256_mul_ps(qq11,velec);
541             
542             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
543
544             /* Update potential sum for this i atom from the interaction with this j atom. */
545             velec            = _mm256_and_ps(velec,cutoff_mask);
546             velecsum         = _mm256_add_ps(velecsum,velec);
547
548             fscal            = felec;
549
550             fscal            = _mm256_and_ps(fscal,cutoff_mask);
551
552             /* Calculate temporary vectorial force */
553             tx               = _mm256_mul_ps(fscal,dx11);
554             ty               = _mm256_mul_ps(fscal,dy11);
555             tz               = _mm256_mul_ps(fscal,dz11);
556
557             /* Update vectorial force */
558             fix1             = _mm256_add_ps(fix1,tx);
559             fiy1             = _mm256_add_ps(fiy1,ty);
560             fiz1             = _mm256_add_ps(fiz1,tz);
561
562             fjx1             = _mm256_add_ps(fjx1,tx);
563             fjy1             = _mm256_add_ps(fjy1,ty);
564             fjz1             = _mm256_add_ps(fjz1,tz);
565
566             }
567
568             /**************************
569              * CALCULATE INTERACTIONS *
570              **************************/
571
572             if (gmx_mm256_any_lt(rsq12,rcutoff2))
573             {
574
575             r12              = _mm256_mul_ps(rsq12,rinv12);
576
577             /* EWALD ELECTROSTATICS */
578             
579             /* Analytical PME correction */
580             zeta2            = _mm256_mul_ps(beta2,rsq12);
581             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
582             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
583             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
584             felec            = _mm256_mul_ps(qq12,felec);
585             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
586             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
587             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
588             velec            = _mm256_mul_ps(qq12,velec);
589             
590             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
591
592             /* Update potential sum for this i atom from the interaction with this j atom. */
593             velec            = _mm256_and_ps(velec,cutoff_mask);
594             velecsum         = _mm256_add_ps(velecsum,velec);
595
596             fscal            = felec;
597
598             fscal            = _mm256_and_ps(fscal,cutoff_mask);
599
600             /* Calculate temporary vectorial force */
601             tx               = _mm256_mul_ps(fscal,dx12);
602             ty               = _mm256_mul_ps(fscal,dy12);
603             tz               = _mm256_mul_ps(fscal,dz12);
604
605             /* Update vectorial force */
606             fix1             = _mm256_add_ps(fix1,tx);
607             fiy1             = _mm256_add_ps(fiy1,ty);
608             fiz1             = _mm256_add_ps(fiz1,tz);
609
610             fjx2             = _mm256_add_ps(fjx2,tx);
611             fjy2             = _mm256_add_ps(fjy2,ty);
612             fjz2             = _mm256_add_ps(fjz2,tz);
613
614             }
615
616             /**************************
617              * CALCULATE INTERACTIONS *
618              **************************/
619
620             if (gmx_mm256_any_lt(rsq20,rcutoff2))
621             {
622
623             r20              = _mm256_mul_ps(rsq20,rinv20);
624
625             /* EWALD ELECTROSTATICS */
626             
627             /* Analytical PME correction */
628             zeta2            = _mm256_mul_ps(beta2,rsq20);
629             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
630             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
631             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
632             felec            = _mm256_mul_ps(qq20,felec);
633             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
634             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
635             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
636             velec            = _mm256_mul_ps(qq20,velec);
637             
638             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
639
640             /* Update potential sum for this i atom from the interaction with this j atom. */
641             velec            = _mm256_and_ps(velec,cutoff_mask);
642             velecsum         = _mm256_add_ps(velecsum,velec);
643
644             fscal            = felec;
645
646             fscal            = _mm256_and_ps(fscal,cutoff_mask);
647
648             /* Calculate temporary vectorial force */
649             tx               = _mm256_mul_ps(fscal,dx20);
650             ty               = _mm256_mul_ps(fscal,dy20);
651             tz               = _mm256_mul_ps(fscal,dz20);
652
653             /* Update vectorial force */
654             fix2             = _mm256_add_ps(fix2,tx);
655             fiy2             = _mm256_add_ps(fiy2,ty);
656             fiz2             = _mm256_add_ps(fiz2,tz);
657
658             fjx0             = _mm256_add_ps(fjx0,tx);
659             fjy0             = _mm256_add_ps(fjy0,ty);
660             fjz0             = _mm256_add_ps(fjz0,tz);
661
662             }
663
664             /**************************
665              * CALCULATE INTERACTIONS *
666              **************************/
667
668             if (gmx_mm256_any_lt(rsq21,rcutoff2))
669             {
670
671             r21              = _mm256_mul_ps(rsq21,rinv21);
672
673             /* EWALD ELECTROSTATICS */
674             
675             /* Analytical PME correction */
676             zeta2            = _mm256_mul_ps(beta2,rsq21);
677             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
678             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
679             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
680             felec            = _mm256_mul_ps(qq21,felec);
681             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
682             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
683             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
684             velec            = _mm256_mul_ps(qq21,velec);
685             
686             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
687
688             /* Update potential sum for this i atom from the interaction with this j atom. */
689             velec            = _mm256_and_ps(velec,cutoff_mask);
690             velecsum         = _mm256_add_ps(velecsum,velec);
691
692             fscal            = felec;
693
694             fscal            = _mm256_and_ps(fscal,cutoff_mask);
695
696             /* Calculate temporary vectorial force */
697             tx               = _mm256_mul_ps(fscal,dx21);
698             ty               = _mm256_mul_ps(fscal,dy21);
699             tz               = _mm256_mul_ps(fscal,dz21);
700
701             /* Update vectorial force */
702             fix2             = _mm256_add_ps(fix2,tx);
703             fiy2             = _mm256_add_ps(fiy2,ty);
704             fiz2             = _mm256_add_ps(fiz2,tz);
705
706             fjx1             = _mm256_add_ps(fjx1,tx);
707             fjy1             = _mm256_add_ps(fjy1,ty);
708             fjz1             = _mm256_add_ps(fjz1,tz);
709
710             }
711
712             /**************************
713              * CALCULATE INTERACTIONS *
714              **************************/
715
716             if (gmx_mm256_any_lt(rsq22,rcutoff2))
717             {
718
719             r22              = _mm256_mul_ps(rsq22,rinv22);
720
721             /* EWALD ELECTROSTATICS */
722             
723             /* Analytical PME correction */
724             zeta2            = _mm256_mul_ps(beta2,rsq22);
725             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
726             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
727             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
728             felec            = _mm256_mul_ps(qq22,felec);
729             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
730             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
731             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
732             velec            = _mm256_mul_ps(qq22,velec);
733             
734             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
735
736             /* Update potential sum for this i atom from the interaction with this j atom. */
737             velec            = _mm256_and_ps(velec,cutoff_mask);
738             velecsum         = _mm256_add_ps(velecsum,velec);
739
740             fscal            = felec;
741
742             fscal            = _mm256_and_ps(fscal,cutoff_mask);
743
744             /* Calculate temporary vectorial force */
745             tx               = _mm256_mul_ps(fscal,dx22);
746             ty               = _mm256_mul_ps(fscal,dy22);
747             tz               = _mm256_mul_ps(fscal,dz22);
748
749             /* Update vectorial force */
750             fix2             = _mm256_add_ps(fix2,tx);
751             fiy2             = _mm256_add_ps(fiy2,ty);
752             fiz2             = _mm256_add_ps(fiz2,tz);
753
754             fjx2             = _mm256_add_ps(fjx2,tx);
755             fjy2             = _mm256_add_ps(fjy2,ty);
756             fjz2             = _mm256_add_ps(fjz2,tz);
757
758             }
759
760             fjptrA             = f+j_coord_offsetA;
761             fjptrB             = f+j_coord_offsetB;
762             fjptrC             = f+j_coord_offsetC;
763             fjptrD             = f+j_coord_offsetD;
764             fjptrE             = f+j_coord_offsetE;
765             fjptrF             = f+j_coord_offsetF;
766             fjptrG             = f+j_coord_offsetG;
767             fjptrH             = f+j_coord_offsetH;
768
769             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
770                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
771
772             /* Inner loop uses 999 flops */
773         }
774
775         if(jidx<j_index_end)
776         {
777
778             /* Get j neighbor index, and coordinate index */
779             jnrlistA         = jjnr[jidx];
780             jnrlistB         = jjnr[jidx+1];
781             jnrlistC         = jjnr[jidx+2];
782             jnrlistD         = jjnr[jidx+3];
783             jnrlistE         = jjnr[jidx+4];
784             jnrlistF         = jjnr[jidx+5];
785             jnrlistG         = jjnr[jidx+6];
786             jnrlistH         = jjnr[jidx+7];
787             /* Sign of each element will be negative for non-real atoms.
788              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
789              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
790              */
791             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
792                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
793                                             
794             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
795             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
796             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
797             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
798             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
799             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
800             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
801             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
802             j_coord_offsetA  = DIM*jnrA;
803             j_coord_offsetB  = DIM*jnrB;
804             j_coord_offsetC  = DIM*jnrC;
805             j_coord_offsetD  = DIM*jnrD;
806             j_coord_offsetE  = DIM*jnrE;
807             j_coord_offsetF  = DIM*jnrF;
808             j_coord_offsetG  = DIM*jnrG;
809             j_coord_offsetH  = DIM*jnrH;
810
811             /* load j atom coordinates */
812             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
813                                                  x+j_coord_offsetC,x+j_coord_offsetD,
814                                                  x+j_coord_offsetE,x+j_coord_offsetF,
815                                                  x+j_coord_offsetG,x+j_coord_offsetH,
816                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
817
818             /* Calculate displacement vector */
819             dx00             = _mm256_sub_ps(ix0,jx0);
820             dy00             = _mm256_sub_ps(iy0,jy0);
821             dz00             = _mm256_sub_ps(iz0,jz0);
822             dx01             = _mm256_sub_ps(ix0,jx1);
823             dy01             = _mm256_sub_ps(iy0,jy1);
824             dz01             = _mm256_sub_ps(iz0,jz1);
825             dx02             = _mm256_sub_ps(ix0,jx2);
826             dy02             = _mm256_sub_ps(iy0,jy2);
827             dz02             = _mm256_sub_ps(iz0,jz2);
828             dx10             = _mm256_sub_ps(ix1,jx0);
829             dy10             = _mm256_sub_ps(iy1,jy0);
830             dz10             = _mm256_sub_ps(iz1,jz0);
831             dx11             = _mm256_sub_ps(ix1,jx1);
832             dy11             = _mm256_sub_ps(iy1,jy1);
833             dz11             = _mm256_sub_ps(iz1,jz1);
834             dx12             = _mm256_sub_ps(ix1,jx2);
835             dy12             = _mm256_sub_ps(iy1,jy2);
836             dz12             = _mm256_sub_ps(iz1,jz2);
837             dx20             = _mm256_sub_ps(ix2,jx0);
838             dy20             = _mm256_sub_ps(iy2,jy0);
839             dz20             = _mm256_sub_ps(iz2,jz0);
840             dx21             = _mm256_sub_ps(ix2,jx1);
841             dy21             = _mm256_sub_ps(iy2,jy1);
842             dz21             = _mm256_sub_ps(iz2,jz1);
843             dx22             = _mm256_sub_ps(ix2,jx2);
844             dy22             = _mm256_sub_ps(iy2,jy2);
845             dz22             = _mm256_sub_ps(iz2,jz2);
846
847             /* Calculate squared distance and things based on it */
848             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
849             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
850             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
851             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
852             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
853             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
854             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
855             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
856             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
857
858             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
859             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
860             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
861             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
862             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
863             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
864             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
865             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
866             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
867
868             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
869             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
870             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
871             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
872             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
873             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
874             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
875             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
876             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
877
878             fjx0             = _mm256_setzero_ps();
879             fjy0             = _mm256_setzero_ps();
880             fjz0             = _mm256_setzero_ps();
881             fjx1             = _mm256_setzero_ps();
882             fjy1             = _mm256_setzero_ps();
883             fjz1             = _mm256_setzero_ps();
884             fjx2             = _mm256_setzero_ps();
885             fjy2             = _mm256_setzero_ps();
886             fjz2             = _mm256_setzero_ps();
887
888             /**************************
889              * CALCULATE INTERACTIONS *
890              **************************/
891
892             if (gmx_mm256_any_lt(rsq00,rcutoff2))
893             {
894
895             r00              = _mm256_mul_ps(rsq00,rinv00);
896             r00              = _mm256_andnot_ps(dummy_mask,r00);
897
898             /* EWALD ELECTROSTATICS */
899             
900             /* Analytical PME correction */
901             zeta2            = _mm256_mul_ps(beta2,rsq00);
902             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
903             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
904             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
905             felec            = _mm256_mul_ps(qq00,felec);
906             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
907             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
908             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
909             velec            = _mm256_mul_ps(qq00,velec);
910             
911             /* LENNARD-JONES DISPERSION/REPULSION */
912
913             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
914             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
915             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
916             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
917                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
918             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
919
920             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
921
922             /* Update potential sum for this i atom from the interaction with this j atom. */
923             velec            = _mm256_and_ps(velec,cutoff_mask);
924             velec            = _mm256_andnot_ps(dummy_mask,velec);
925             velecsum         = _mm256_add_ps(velecsum,velec);
926             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
927             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
928             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
929
930             fscal            = _mm256_add_ps(felec,fvdw);
931
932             fscal            = _mm256_and_ps(fscal,cutoff_mask);
933
934             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
935
936             /* Calculate temporary vectorial force */
937             tx               = _mm256_mul_ps(fscal,dx00);
938             ty               = _mm256_mul_ps(fscal,dy00);
939             tz               = _mm256_mul_ps(fscal,dz00);
940
941             /* Update vectorial force */
942             fix0             = _mm256_add_ps(fix0,tx);
943             fiy0             = _mm256_add_ps(fiy0,ty);
944             fiz0             = _mm256_add_ps(fiz0,tz);
945
946             fjx0             = _mm256_add_ps(fjx0,tx);
947             fjy0             = _mm256_add_ps(fjy0,ty);
948             fjz0             = _mm256_add_ps(fjz0,tz);
949
950             }
951
952             /**************************
953              * CALCULATE INTERACTIONS *
954              **************************/
955
956             if (gmx_mm256_any_lt(rsq01,rcutoff2))
957             {
958
959             r01              = _mm256_mul_ps(rsq01,rinv01);
960             r01              = _mm256_andnot_ps(dummy_mask,r01);
961
962             /* EWALD ELECTROSTATICS */
963             
964             /* Analytical PME correction */
965             zeta2            = _mm256_mul_ps(beta2,rsq01);
966             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
967             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
968             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
969             felec            = _mm256_mul_ps(qq01,felec);
970             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
971             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
972             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv01,sh_ewald),pmecorrV);
973             velec            = _mm256_mul_ps(qq01,velec);
974             
975             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
976
977             /* Update potential sum for this i atom from the interaction with this j atom. */
978             velec            = _mm256_and_ps(velec,cutoff_mask);
979             velec            = _mm256_andnot_ps(dummy_mask,velec);
980             velecsum         = _mm256_add_ps(velecsum,velec);
981
982             fscal            = felec;
983
984             fscal            = _mm256_and_ps(fscal,cutoff_mask);
985
986             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
987
988             /* Calculate temporary vectorial force */
989             tx               = _mm256_mul_ps(fscal,dx01);
990             ty               = _mm256_mul_ps(fscal,dy01);
991             tz               = _mm256_mul_ps(fscal,dz01);
992
993             /* Update vectorial force */
994             fix0             = _mm256_add_ps(fix0,tx);
995             fiy0             = _mm256_add_ps(fiy0,ty);
996             fiz0             = _mm256_add_ps(fiz0,tz);
997
998             fjx1             = _mm256_add_ps(fjx1,tx);
999             fjy1             = _mm256_add_ps(fjy1,ty);
1000             fjz1             = _mm256_add_ps(fjz1,tz);
1001
1002             }
1003
1004             /**************************
1005              * CALCULATE INTERACTIONS *
1006              **************************/
1007
1008             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1009             {
1010
1011             r02              = _mm256_mul_ps(rsq02,rinv02);
1012             r02              = _mm256_andnot_ps(dummy_mask,r02);
1013
1014             /* EWALD ELECTROSTATICS */
1015             
1016             /* Analytical PME correction */
1017             zeta2            = _mm256_mul_ps(beta2,rsq02);
1018             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1019             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1020             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1021             felec            = _mm256_mul_ps(qq02,felec);
1022             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1023             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1024             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv02,sh_ewald),pmecorrV);
1025             velec            = _mm256_mul_ps(qq02,velec);
1026             
1027             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1028
1029             /* Update potential sum for this i atom from the interaction with this j atom. */
1030             velec            = _mm256_and_ps(velec,cutoff_mask);
1031             velec            = _mm256_andnot_ps(dummy_mask,velec);
1032             velecsum         = _mm256_add_ps(velecsum,velec);
1033
1034             fscal            = felec;
1035
1036             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1037
1038             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1039
1040             /* Calculate temporary vectorial force */
1041             tx               = _mm256_mul_ps(fscal,dx02);
1042             ty               = _mm256_mul_ps(fscal,dy02);
1043             tz               = _mm256_mul_ps(fscal,dz02);
1044
1045             /* Update vectorial force */
1046             fix0             = _mm256_add_ps(fix0,tx);
1047             fiy0             = _mm256_add_ps(fiy0,ty);
1048             fiz0             = _mm256_add_ps(fiz0,tz);
1049
1050             fjx2             = _mm256_add_ps(fjx2,tx);
1051             fjy2             = _mm256_add_ps(fjy2,ty);
1052             fjz2             = _mm256_add_ps(fjz2,tz);
1053
1054             }
1055
1056             /**************************
1057              * CALCULATE INTERACTIONS *
1058              **************************/
1059
1060             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1061             {
1062
1063             r10              = _mm256_mul_ps(rsq10,rinv10);
1064             r10              = _mm256_andnot_ps(dummy_mask,r10);
1065
1066             /* EWALD ELECTROSTATICS */
1067             
1068             /* Analytical PME correction */
1069             zeta2            = _mm256_mul_ps(beta2,rsq10);
1070             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1071             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1072             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1073             felec            = _mm256_mul_ps(qq10,felec);
1074             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1075             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1076             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
1077             velec            = _mm256_mul_ps(qq10,velec);
1078             
1079             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1080
1081             /* Update potential sum for this i atom from the interaction with this j atom. */
1082             velec            = _mm256_and_ps(velec,cutoff_mask);
1083             velec            = _mm256_andnot_ps(dummy_mask,velec);
1084             velecsum         = _mm256_add_ps(velecsum,velec);
1085
1086             fscal            = felec;
1087
1088             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1089
1090             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1091
1092             /* Calculate temporary vectorial force */
1093             tx               = _mm256_mul_ps(fscal,dx10);
1094             ty               = _mm256_mul_ps(fscal,dy10);
1095             tz               = _mm256_mul_ps(fscal,dz10);
1096
1097             /* Update vectorial force */
1098             fix1             = _mm256_add_ps(fix1,tx);
1099             fiy1             = _mm256_add_ps(fiy1,ty);
1100             fiz1             = _mm256_add_ps(fiz1,tz);
1101
1102             fjx0             = _mm256_add_ps(fjx0,tx);
1103             fjy0             = _mm256_add_ps(fjy0,ty);
1104             fjz0             = _mm256_add_ps(fjz0,tz);
1105
1106             }
1107
1108             /**************************
1109              * CALCULATE INTERACTIONS *
1110              **************************/
1111
1112             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1113             {
1114
1115             r11              = _mm256_mul_ps(rsq11,rinv11);
1116             r11              = _mm256_andnot_ps(dummy_mask,r11);
1117
1118             /* EWALD ELECTROSTATICS */
1119             
1120             /* Analytical PME correction */
1121             zeta2            = _mm256_mul_ps(beta2,rsq11);
1122             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1123             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1124             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1125             felec            = _mm256_mul_ps(qq11,felec);
1126             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1127             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1128             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
1129             velec            = _mm256_mul_ps(qq11,velec);
1130             
1131             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1132
1133             /* Update potential sum for this i atom from the interaction with this j atom. */
1134             velec            = _mm256_and_ps(velec,cutoff_mask);
1135             velec            = _mm256_andnot_ps(dummy_mask,velec);
1136             velecsum         = _mm256_add_ps(velecsum,velec);
1137
1138             fscal            = felec;
1139
1140             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1141
1142             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1143
1144             /* Calculate temporary vectorial force */
1145             tx               = _mm256_mul_ps(fscal,dx11);
1146             ty               = _mm256_mul_ps(fscal,dy11);
1147             tz               = _mm256_mul_ps(fscal,dz11);
1148
1149             /* Update vectorial force */
1150             fix1             = _mm256_add_ps(fix1,tx);
1151             fiy1             = _mm256_add_ps(fiy1,ty);
1152             fiz1             = _mm256_add_ps(fiz1,tz);
1153
1154             fjx1             = _mm256_add_ps(fjx1,tx);
1155             fjy1             = _mm256_add_ps(fjy1,ty);
1156             fjz1             = _mm256_add_ps(fjz1,tz);
1157
1158             }
1159
1160             /**************************
1161              * CALCULATE INTERACTIONS *
1162              **************************/
1163
1164             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1165             {
1166
1167             r12              = _mm256_mul_ps(rsq12,rinv12);
1168             r12              = _mm256_andnot_ps(dummy_mask,r12);
1169
1170             /* EWALD ELECTROSTATICS */
1171             
1172             /* Analytical PME correction */
1173             zeta2            = _mm256_mul_ps(beta2,rsq12);
1174             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1175             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1176             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1177             felec            = _mm256_mul_ps(qq12,felec);
1178             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1179             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1180             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
1181             velec            = _mm256_mul_ps(qq12,velec);
1182             
1183             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1184
1185             /* Update potential sum for this i atom from the interaction with this j atom. */
1186             velec            = _mm256_and_ps(velec,cutoff_mask);
1187             velec            = _mm256_andnot_ps(dummy_mask,velec);
1188             velecsum         = _mm256_add_ps(velecsum,velec);
1189
1190             fscal            = felec;
1191
1192             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1193
1194             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1195
1196             /* Calculate temporary vectorial force */
1197             tx               = _mm256_mul_ps(fscal,dx12);
1198             ty               = _mm256_mul_ps(fscal,dy12);
1199             tz               = _mm256_mul_ps(fscal,dz12);
1200
1201             /* Update vectorial force */
1202             fix1             = _mm256_add_ps(fix1,tx);
1203             fiy1             = _mm256_add_ps(fiy1,ty);
1204             fiz1             = _mm256_add_ps(fiz1,tz);
1205
1206             fjx2             = _mm256_add_ps(fjx2,tx);
1207             fjy2             = _mm256_add_ps(fjy2,ty);
1208             fjz2             = _mm256_add_ps(fjz2,tz);
1209
1210             }
1211
1212             /**************************
1213              * CALCULATE INTERACTIONS *
1214              **************************/
1215
1216             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1217             {
1218
1219             r20              = _mm256_mul_ps(rsq20,rinv20);
1220             r20              = _mm256_andnot_ps(dummy_mask,r20);
1221
1222             /* EWALD ELECTROSTATICS */
1223             
1224             /* Analytical PME correction */
1225             zeta2            = _mm256_mul_ps(beta2,rsq20);
1226             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1227             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1228             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1229             felec            = _mm256_mul_ps(qq20,felec);
1230             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1231             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1232             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
1233             velec            = _mm256_mul_ps(qq20,velec);
1234             
1235             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1236
1237             /* Update potential sum for this i atom from the interaction with this j atom. */
1238             velec            = _mm256_and_ps(velec,cutoff_mask);
1239             velec            = _mm256_andnot_ps(dummy_mask,velec);
1240             velecsum         = _mm256_add_ps(velecsum,velec);
1241
1242             fscal            = felec;
1243
1244             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1245
1246             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1247
1248             /* Calculate temporary vectorial force */
1249             tx               = _mm256_mul_ps(fscal,dx20);
1250             ty               = _mm256_mul_ps(fscal,dy20);
1251             tz               = _mm256_mul_ps(fscal,dz20);
1252
1253             /* Update vectorial force */
1254             fix2             = _mm256_add_ps(fix2,tx);
1255             fiy2             = _mm256_add_ps(fiy2,ty);
1256             fiz2             = _mm256_add_ps(fiz2,tz);
1257
1258             fjx0             = _mm256_add_ps(fjx0,tx);
1259             fjy0             = _mm256_add_ps(fjy0,ty);
1260             fjz0             = _mm256_add_ps(fjz0,tz);
1261
1262             }
1263
1264             /**************************
1265              * CALCULATE INTERACTIONS *
1266              **************************/
1267
1268             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1269             {
1270
1271             r21              = _mm256_mul_ps(rsq21,rinv21);
1272             r21              = _mm256_andnot_ps(dummy_mask,r21);
1273
1274             /* EWALD ELECTROSTATICS */
1275             
1276             /* Analytical PME correction */
1277             zeta2            = _mm256_mul_ps(beta2,rsq21);
1278             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1279             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1280             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1281             felec            = _mm256_mul_ps(qq21,felec);
1282             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1283             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1284             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
1285             velec            = _mm256_mul_ps(qq21,velec);
1286             
1287             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1288
1289             /* Update potential sum for this i atom from the interaction with this j atom. */
1290             velec            = _mm256_and_ps(velec,cutoff_mask);
1291             velec            = _mm256_andnot_ps(dummy_mask,velec);
1292             velecsum         = _mm256_add_ps(velecsum,velec);
1293
1294             fscal            = felec;
1295
1296             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1297
1298             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1299
1300             /* Calculate temporary vectorial force */
1301             tx               = _mm256_mul_ps(fscal,dx21);
1302             ty               = _mm256_mul_ps(fscal,dy21);
1303             tz               = _mm256_mul_ps(fscal,dz21);
1304
1305             /* Update vectorial force */
1306             fix2             = _mm256_add_ps(fix2,tx);
1307             fiy2             = _mm256_add_ps(fiy2,ty);
1308             fiz2             = _mm256_add_ps(fiz2,tz);
1309
1310             fjx1             = _mm256_add_ps(fjx1,tx);
1311             fjy1             = _mm256_add_ps(fjy1,ty);
1312             fjz1             = _mm256_add_ps(fjz1,tz);
1313
1314             }
1315
1316             /**************************
1317              * CALCULATE INTERACTIONS *
1318              **************************/
1319
1320             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1321             {
1322
1323             r22              = _mm256_mul_ps(rsq22,rinv22);
1324             r22              = _mm256_andnot_ps(dummy_mask,r22);
1325
1326             /* EWALD ELECTROSTATICS */
1327             
1328             /* Analytical PME correction */
1329             zeta2            = _mm256_mul_ps(beta2,rsq22);
1330             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1331             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1332             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1333             felec            = _mm256_mul_ps(qq22,felec);
1334             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1335             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1336             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
1337             velec            = _mm256_mul_ps(qq22,velec);
1338             
1339             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1340
1341             /* Update potential sum for this i atom from the interaction with this j atom. */
1342             velec            = _mm256_and_ps(velec,cutoff_mask);
1343             velec            = _mm256_andnot_ps(dummy_mask,velec);
1344             velecsum         = _mm256_add_ps(velecsum,velec);
1345
1346             fscal            = felec;
1347
1348             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1349
1350             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1351
1352             /* Calculate temporary vectorial force */
1353             tx               = _mm256_mul_ps(fscal,dx22);
1354             ty               = _mm256_mul_ps(fscal,dy22);
1355             tz               = _mm256_mul_ps(fscal,dz22);
1356
1357             /* Update vectorial force */
1358             fix2             = _mm256_add_ps(fix2,tx);
1359             fiy2             = _mm256_add_ps(fiy2,ty);
1360             fiz2             = _mm256_add_ps(fiz2,tz);
1361
1362             fjx2             = _mm256_add_ps(fjx2,tx);
1363             fjy2             = _mm256_add_ps(fjy2,ty);
1364             fjz2             = _mm256_add_ps(fjz2,tz);
1365
1366             }
1367
1368             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1369             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1370             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1371             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1372             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1373             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1374             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1375             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1376
1377             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1378                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1379
1380             /* Inner loop uses 1008 flops */
1381         }
1382
1383         /* End of innermost loop */
1384
1385         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1386                                                  f+i_coord_offset,fshift+i_shift_offset);
1387
1388         ggid                        = gid[iidx];
1389         /* Update potential energies */
1390         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1391         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1392
1393         /* Increment number of inner iterations */
1394         inneriter                  += j_index_end - j_index_start;
1395
1396         /* Outer loop uses 20 flops */
1397     }
1398
1399     /* Increment number of outer iterations */
1400     outeriter        += nri;
1401
1402     /* Update outer/inner flops */
1403
1404     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*1008);
1405 }
1406 /*
1407  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_single
1408  * Electrostatics interaction: Ewald
1409  * VdW interaction:            LennardJones
1410  * Geometry:                   Water3-Water3
1411  * Calculate force/pot:        Force
1412  */
1413 void
1414 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_single
1415                     (t_nblist * gmx_restrict                nlist,
1416                      rvec * gmx_restrict                    xx,
1417                      rvec * gmx_restrict                    ff,
1418                      t_forcerec * gmx_restrict              fr,
1419                      t_mdatoms * gmx_restrict               mdatoms,
1420                      nb_kernel_data_t * gmx_restrict        kernel_data,
1421                      t_nrnb * gmx_restrict                  nrnb)
1422 {
1423     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1424      * just 0 for non-waters.
1425      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. the eight different
1426      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1427      */
1428     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1429     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1430     int              jnrA,jnrB,jnrC,jnrD;
1431     int              jnrE,jnrF,jnrG,jnrH;
1432     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1433     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1434     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1435     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1436     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1437     real             rcutoff_scalar;
1438     real             *shiftvec,*fshift,*x,*f;
1439     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1440     real             scratch[4*DIM];
1441     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1442     real *           vdwioffsetptr0;
1443     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1444     real *           vdwioffsetptr1;
1445     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1446     real *           vdwioffsetptr2;
1447     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1448     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1449     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1450     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1451     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1452     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1453     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1454     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1455     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1456     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1457     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1458     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1459     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1460     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1461     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1462     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1463     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1464     real             *charge;
1465     int              nvdwtype;
1466     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1467     int              *vdwtype;
1468     real             *vdwparam;
1469     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1470     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1471     __m256i          ewitab;
1472     __m128i          ewitab_lo,ewitab_hi;
1473     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1474     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1475     real             *ewtab;
1476     __m256           dummy_mask,cutoff_mask;
1477     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1478     __m256           one     = _mm256_set1_ps(1.0);
1479     __m256           two     = _mm256_set1_ps(2.0);
1480     x                = xx[0];
1481     f                = ff[0];
1482
1483     nri              = nlist->nri;
1484     iinr             = nlist->iinr;
1485     jindex           = nlist->jindex;
1486     jjnr             = nlist->jjnr;
1487     shiftidx         = nlist->shift;
1488     gid              = nlist->gid;
1489     shiftvec         = fr->shift_vec[0];
1490     fshift           = fr->fshift[0];
1491     facel            = _mm256_set1_ps(fr->epsfac);
1492     charge           = mdatoms->chargeA;
1493     nvdwtype         = fr->ntype;
1494     vdwparam         = fr->nbfp;
1495     vdwtype          = mdatoms->typeA;
1496
1497     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1498     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
1499     beta2            = _mm256_mul_ps(beta,beta);
1500     beta3            = _mm256_mul_ps(beta,beta2);
1501
1502     ewtab            = fr->ic->tabq_coul_F;
1503     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1504     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1505
1506     /* Setup water-specific parameters */
1507     inr              = nlist->iinr[0];
1508     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1509     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1510     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1511     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1512
1513     jq0              = _mm256_set1_ps(charge[inr+0]);
1514     jq1              = _mm256_set1_ps(charge[inr+1]);
1515     jq2              = _mm256_set1_ps(charge[inr+2]);
1516     vdwjidx0A        = 2*vdwtype[inr+0];
1517     qq00             = _mm256_mul_ps(iq0,jq0);
1518     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1519     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1520     qq01             = _mm256_mul_ps(iq0,jq1);
1521     qq02             = _mm256_mul_ps(iq0,jq2);
1522     qq10             = _mm256_mul_ps(iq1,jq0);
1523     qq11             = _mm256_mul_ps(iq1,jq1);
1524     qq12             = _mm256_mul_ps(iq1,jq2);
1525     qq20             = _mm256_mul_ps(iq2,jq0);
1526     qq21             = _mm256_mul_ps(iq2,jq1);
1527     qq22             = _mm256_mul_ps(iq2,jq2);
1528
1529     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1530     rcutoff_scalar   = fr->rcoulomb;
1531     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1532     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1533
1534     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
1535     rvdw             = _mm256_set1_ps(fr->rvdw);
1536
1537     /* Avoid stupid compiler warnings */
1538     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1539     j_coord_offsetA = 0;
1540     j_coord_offsetB = 0;
1541     j_coord_offsetC = 0;
1542     j_coord_offsetD = 0;
1543     j_coord_offsetE = 0;
1544     j_coord_offsetF = 0;
1545     j_coord_offsetG = 0;
1546     j_coord_offsetH = 0;
1547
1548     outeriter        = 0;
1549     inneriter        = 0;
1550
1551     for(iidx=0;iidx<4*DIM;iidx++)
1552     {
1553         scratch[iidx] = 0.0;
1554     }
1555
1556     /* Start outer loop over neighborlists */
1557     for(iidx=0; iidx<nri; iidx++)
1558     {
1559         /* Load shift vector for this list */
1560         i_shift_offset   = DIM*shiftidx[iidx];
1561
1562         /* Load limits for loop over neighbors */
1563         j_index_start    = jindex[iidx];
1564         j_index_end      = jindex[iidx+1];
1565
1566         /* Get outer coordinate index */
1567         inr              = iinr[iidx];
1568         i_coord_offset   = DIM*inr;
1569
1570         /* Load i particle coords and add shift vector */
1571         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1572                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1573
1574         fix0             = _mm256_setzero_ps();
1575         fiy0             = _mm256_setzero_ps();
1576         fiz0             = _mm256_setzero_ps();
1577         fix1             = _mm256_setzero_ps();
1578         fiy1             = _mm256_setzero_ps();
1579         fiz1             = _mm256_setzero_ps();
1580         fix2             = _mm256_setzero_ps();
1581         fiy2             = _mm256_setzero_ps();
1582         fiz2             = _mm256_setzero_ps();
1583
1584         /* Start inner kernel loop */
1585         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1586         {
1587
1588             /* Get j neighbor index, and coordinate index */
1589             jnrA             = jjnr[jidx];
1590             jnrB             = jjnr[jidx+1];
1591             jnrC             = jjnr[jidx+2];
1592             jnrD             = jjnr[jidx+3];
1593             jnrE             = jjnr[jidx+4];
1594             jnrF             = jjnr[jidx+5];
1595             jnrG             = jjnr[jidx+6];
1596             jnrH             = jjnr[jidx+7];
1597             j_coord_offsetA  = DIM*jnrA;
1598             j_coord_offsetB  = DIM*jnrB;
1599             j_coord_offsetC  = DIM*jnrC;
1600             j_coord_offsetD  = DIM*jnrD;
1601             j_coord_offsetE  = DIM*jnrE;
1602             j_coord_offsetF  = DIM*jnrF;
1603             j_coord_offsetG  = DIM*jnrG;
1604             j_coord_offsetH  = DIM*jnrH;
1605
1606             /* load j atom coordinates */
1607             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1608                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1609                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1610                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1611                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1612
1613             /* Calculate displacement vector */
1614             dx00             = _mm256_sub_ps(ix0,jx0);
1615             dy00             = _mm256_sub_ps(iy0,jy0);
1616             dz00             = _mm256_sub_ps(iz0,jz0);
1617             dx01             = _mm256_sub_ps(ix0,jx1);
1618             dy01             = _mm256_sub_ps(iy0,jy1);
1619             dz01             = _mm256_sub_ps(iz0,jz1);
1620             dx02             = _mm256_sub_ps(ix0,jx2);
1621             dy02             = _mm256_sub_ps(iy0,jy2);
1622             dz02             = _mm256_sub_ps(iz0,jz2);
1623             dx10             = _mm256_sub_ps(ix1,jx0);
1624             dy10             = _mm256_sub_ps(iy1,jy0);
1625             dz10             = _mm256_sub_ps(iz1,jz0);
1626             dx11             = _mm256_sub_ps(ix1,jx1);
1627             dy11             = _mm256_sub_ps(iy1,jy1);
1628             dz11             = _mm256_sub_ps(iz1,jz1);
1629             dx12             = _mm256_sub_ps(ix1,jx2);
1630             dy12             = _mm256_sub_ps(iy1,jy2);
1631             dz12             = _mm256_sub_ps(iz1,jz2);
1632             dx20             = _mm256_sub_ps(ix2,jx0);
1633             dy20             = _mm256_sub_ps(iy2,jy0);
1634             dz20             = _mm256_sub_ps(iz2,jz0);
1635             dx21             = _mm256_sub_ps(ix2,jx1);
1636             dy21             = _mm256_sub_ps(iy2,jy1);
1637             dz21             = _mm256_sub_ps(iz2,jz1);
1638             dx22             = _mm256_sub_ps(ix2,jx2);
1639             dy22             = _mm256_sub_ps(iy2,jy2);
1640             dz22             = _mm256_sub_ps(iz2,jz2);
1641
1642             /* Calculate squared distance and things based on it */
1643             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1644             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1645             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1646             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1647             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1648             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1649             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1650             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1651             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1652
1653             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1654             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1655             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1656             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1657             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1658             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1659             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1660             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1661             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1662
1663             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1664             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1665             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1666             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1667             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1668             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1669             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1670             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1671             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1672
1673             fjx0             = _mm256_setzero_ps();
1674             fjy0             = _mm256_setzero_ps();
1675             fjz0             = _mm256_setzero_ps();
1676             fjx1             = _mm256_setzero_ps();
1677             fjy1             = _mm256_setzero_ps();
1678             fjz1             = _mm256_setzero_ps();
1679             fjx2             = _mm256_setzero_ps();
1680             fjy2             = _mm256_setzero_ps();
1681             fjz2             = _mm256_setzero_ps();
1682
1683             /**************************
1684              * CALCULATE INTERACTIONS *
1685              **************************/
1686
1687             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1688             {
1689
1690             r00              = _mm256_mul_ps(rsq00,rinv00);
1691
1692             /* EWALD ELECTROSTATICS */
1693             
1694             /* Analytical PME correction */
1695             zeta2            = _mm256_mul_ps(beta2,rsq00);
1696             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1697             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1698             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1699             felec            = _mm256_mul_ps(qq00,felec);
1700             
1701             /* LENNARD-JONES DISPERSION/REPULSION */
1702
1703             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1704             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1705
1706             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1707
1708             fscal            = _mm256_add_ps(felec,fvdw);
1709
1710             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1711
1712             /* Calculate temporary vectorial force */
1713             tx               = _mm256_mul_ps(fscal,dx00);
1714             ty               = _mm256_mul_ps(fscal,dy00);
1715             tz               = _mm256_mul_ps(fscal,dz00);
1716
1717             /* Update vectorial force */
1718             fix0             = _mm256_add_ps(fix0,tx);
1719             fiy0             = _mm256_add_ps(fiy0,ty);
1720             fiz0             = _mm256_add_ps(fiz0,tz);
1721
1722             fjx0             = _mm256_add_ps(fjx0,tx);
1723             fjy0             = _mm256_add_ps(fjy0,ty);
1724             fjz0             = _mm256_add_ps(fjz0,tz);
1725
1726             }
1727
1728             /**************************
1729              * CALCULATE INTERACTIONS *
1730              **************************/
1731
1732             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1733             {
1734
1735             r01              = _mm256_mul_ps(rsq01,rinv01);
1736
1737             /* EWALD ELECTROSTATICS */
1738             
1739             /* Analytical PME correction */
1740             zeta2            = _mm256_mul_ps(beta2,rsq01);
1741             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1742             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1743             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1744             felec            = _mm256_mul_ps(qq01,felec);
1745             
1746             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1747
1748             fscal            = felec;
1749
1750             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1751
1752             /* Calculate temporary vectorial force */
1753             tx               = _mm256_mul_ps(fscal,dx01);
1754             ty               = _mm256_mul_ps(fscal,dy01);
1755             tz               = _mm256_mul_ps(fscal,dz01);
1756
1757             /* Update vectorial force */
1758             fix0             = _mm256_add_ps(fix0,tx);
1759             fiy0             = _mm256_add_ps(fiy0,ty);
1760             fiz0             = _mm256_add_ps(fiz0,tz);
1761
1762             fjx1             = _mm256_add_ps(fjx1,tx);
1763             fjy1             = _mm256_add_ps(fjy1,ty);
1764             fjz1             = _mm256_add_ps(fjz1,tz);
1765
1766             }
1767
1768             /**************************
1769              * CALCULATE INTERACTIONS *
1770              **************************/
1771
1772             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1773             {
1774
1775             r02              = _mm256_mul_ps(rsq02,rinv02);
1776
1777             /* EWALD ELECTROSTATICS */
1778             
1779             /* Analytical PME correction */
1780             zeta2            = _mm256_mul_ps(beta2,rsq02);
1781             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1782             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1783             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1784             felec            = _mm256_mul_ps(qq02,felec);
1785             
1786             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1787
1788             fscal            = felec;
1789
1790             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1791
1792             /* Calculate temporary vectorial force */
1793             tx               = _mm256_mul_ps(fscal,dx02);
1794             ty               = _mm256_mul_ps(fscal,dy02);
1795             tz               = _mm256_mul_ps(fscal,dz02);
1796
1797             /* Update vectorial force */
1798             fix0             = _mm256_add_ps(fix0,tx);
1799             fiy0             = _mm256_add_ps(fiy0,ty);
1800             fiz0             = _mm256_add_ps(fiz0,tz);
1801
1802             fjx2             = _mm256_add_ps(fjx2,tx);
1803             fjy2             = _mm256_add_ps(fjy2,ty);
1804             fjz2             = _mm256_add_ps(fjz2,tz);
1805
1806             }
1807
1808             /**************************
1809              * CALCULATE INTERACTIONS *
1810              **************************/
1811
1812             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1813             {
1814
1815             r10              = _mm256_mul_ps(rsq10,rinv10);
1816
1817             /* EWALD ELECTROSTATICS */
1818             
1819             /* Analytical PME correction */
1820             zeta2            = _mm256_mul_ps(beta2,rsq10);
1821             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1822             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1823             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1824             felec            = _mm256_mul_ps(qq10,felec);
1825             
1826             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1827
1828             fscal            = felec;
1829
1830             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1831
1832             /* Calculate temporary vectorial force */
1833             tx               = _mm256_mul_ps(fscal,dx10);
1834             ty               = _mm256_mul_ps(fscal,dy10);
1835             tz               = _mm256_mul_ps(fscal,dz10);
1836
1837             /* Update vectorial force */
1838             fix1             = _mm256_add_ps(fix1,tx);
1839             fiy1             = _mm256_add_ps(fiy1,ty);
1840             fiz1             = _mm256_add_ps(fiz1,tz);
1841
1842             fjx0             = _mm256_add_ps(fjx0,tx);
1843             fjy0             = _mm256_add_ps(fjy0,ty);
1844             fjz0             = _mm256_add_ps(fjz0,tz);
1845
1846             }
1847
1848             /**************************
1849              * CALCULATE INTERACTIONS *
1850              **************************/
1851
1852             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1853             {
1854
1855             r11              = _mm256_mul_ps(rsq11,rinv11);
1856
1857             /* EWALD ELECTROSTATICS */
1858             
1859             /* Analytical PME correction */
1860             zeta2            = _mm256_mul_ps(beta2,rsq11);
1861             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1862             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1863             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1864             felec            = _mm256_mul_ps(qq11,felec);
1865             
1866             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1867
1868             fscal            = felec;
1869
1870             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1871
1872             /* Calculate temporary vectorial force */
1873             tx               = _mm256_mul_ps(fscal,dx11);
1874             ty               = _mm256_mul_ps(fscal,dy11);
1875             tz               = _mm256_mul_ps(fscal,dz11);
1876
1877             /* Update vectorial force */
1878             fix1             = _mm256_add_ps(fix1,tx);
1879             fiy1             = _mm256_add_ps(fiy1,ty);
1880             fiz1             = _mm256_add_ps(fiz1,tz);
1881
1882             fjx1             = _mm256_add_ps(fjx1,tx);
1883             fjy1             = _mm256_add_ps(fjy1,ty);
1884             fjz1             = _mm256_add_ps(fjz1,tz);
1885
1886             }
1887
1888             /**************************
1889              * CALCULATE INTERACTIONS *
1890              **************************/
1891
1892             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1893             {
1894
1895             r12              = _mm256_mul_ps(rsq12,rinv12);
1896
1897             /* EWALD ELECTROSTATICS */
1898             
1899             /* Analytical PME correction */
1900             zeta2            = _mm256_mul_ps(beta2,rsq12);
1901             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1902             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1903             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1904             felec            = _mm256_mul_ps(qq12,felec);
1905             
1906             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1907
1908             fscal            = felec;
1909
1910             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1911
1912             /* Calculate temporary vectorial force */
1913             tx               = _mm256_mul_ps(fscal,dx12);
1914             ty               = _mm256_mul_ps(fscal,dy12);
1915             tz               = _mm256_mul_ps(fscal,dz12);
1916
1917             /* Update vectorial force */
1918             fix1             = _mm256_add_ps(fix1,tx);
1919             fiy1             = _mm256_add_ps(fiy1,ty);
1920             fiz1             = _mm256_add_ps(fiz1,tz);
1921
1922             fjx2             = _mm256_add_ps(fjx2,tx);
1923             fjy2             = _mm256_add_ps(fjy2,ty);
1924             fjz2             = _mm256_add_ps(fjz2,tz);
1925
1926             }
1927
1928             /**************************
1929              * CALCULATE INTERACTIONS *
1930              **************************/
1931
1932             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1933             {
1934
1935             r20              = _mm256_mul_ps(rsq20,rinv20);
1936
1937             /* EWALD ELECTROSTATICS */
1938             
1939             /* Analytical PME correction */
1940             zeta2            = _mm256_mul_ps(beta2,rsq20);
1941             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1942             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1943             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1944             felec            = _mm256_mul_ps(qq20,felec);
1945             
1946             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1947
1948             fscal            = felec;
1949
1950             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1951
1952             /* Calculate temporary vectorial force */
1953             tx               = _mm256_mul_ps(fscal,dx20);
1954             ty               = _mm256_mul_ps(fscal,dy20);
1955             tz               = _mm256_mul_ps(fscal,dz20);
1956
1957             /* Update vectorial force */
1958             fix2             = _mm256_add_ps(fix2,tx);
1959             fiy2             = _mm256_add_ps(fiy2,ty);
1960             fiz2             = _mm256_add_ps(fiz2,tz);
1961
1962             fjx0             = _mm256_add_ps(fjx0,tx);
1963             fjy0             = _mm256_add_ps(fjy0,ty);
1964             fjz0             = _mm256_add_ps(fjz0,tz);
1965
1966             }
1967
1968             /**************************
1969              * CALCULATE INTERACTIONS *
1970              **************************/
1971
1972             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1973             {
1974
1975             r21              = _mm256_mul_ps(rsq21,rinv21);
1976
1977             /* EWALD ELECTROSTATICS */
1978             
1979             /* Analytical PME correction */
1980             zeta2            = _mm256_mul_ps(beta2,rsq21);
1981             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1982             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1983             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1984             felec            = _mm256_mul_ps(qq21,felec);
1985             
1986             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1987
1988             fscal            = felec;
1989
1990             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1991
1992             /* Calculate temporary vectorial force */
1993             tx               = _mm256_mul_ps(fscal,dx21);
1994             ty               = _mm256_mul_ps(fscal,dy21);
1995             tz               = _mm256_mul_ps(fscal,dz21);
1996
1997             /* Update vectorial force */
1998             fix2             = _mm256_add_ps(fix2,tx);
1999             fiy2             = _mm256_add_ps(fiy2,ty);
2000             fiz2             = _mm256_add_ps(fiz2,tz);
2001
2002             fjx1             = _mm256_add_ps(fjx1,tx);
2003             fjy1             = _mm256_add_ps(fjy1,ty);
2004             fjz1             = _mm256_add_ps(fjz1,tz);
2005
2006             }
2007
2008             /**************************
2009              * CALCULATE INTERACTIONS *
2010              **************************/
2011
2012             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2013             {
2014
2015             r22              = _mm256_mul_ps(rsq22,rinv22);
2016
2017             /* EWALD ELECTROSTATICS */
2018             
2019             /* Analytical PME correction */
2020             zeta2            = _mm256_mul_ps(beta2,rsq22);
2021             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2022             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2023             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2024             felec            = _mm256_mul_ps(qq22,felec);
2025             
2026             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2027
2028             fscal            = felec;
2029
2030             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2031
2032             /* Calculate temporary vectorial force */
2033             tx               = _mm256_mul_ps(fscal,dx22);
2034             ty               = _mm256_mul_ps(fscal,dy22);
2035             tz               = _mm256_mul_ps(fscal,dz22);
2036
2037             /* Update vectorial force */
2038             fix2             = _mm256_add_ps(fix2,tx);
2039             fiy2             = _mm256_add_ps(fiy2,ty);
2040             fiz2             = _mm256_add_ps(fiz2,tz);
2041
2042             fjx2             = _mm256_add_ps(fjx2,tx);
2043             fjy2             = _mm256_add_ps(fjy2,ty);
2044             fjz2             = _mm256_add_ps(fjz2,tz);
2045
2046             }
2047
2048             fjptrA             = f+j_coord_offsetA;
2049             fjptrB             = f+j_coord_offsetB;
2050             fjptrC             = f+j_coord_offsetC;
2051             fjptrD             = f+j_coord_offsetD;
2052             fjptrE             = f+j_coord_offsetE;
2053             fjptrF             = f+j_coord_offsetF;
2054             fjptrG             = f+j_coord_offsetG;
2055             fjptrH             = f+j_coord_offsetH;
2056
2057             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2058                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2059
2060             /* Inner loop uses 538 flops */
2061         }
2062
2063         if(jidx<j_index_end)
2064         {
2065
2066             /* Get j neighbor index, and coordinate index */
2067             jnrlistA         = jjnr[jidx];
2068             jnrlistB         = jjnr[jidx+1];
2069             jnrlistC         = jjnr[jidx+2];
2070             jnrlistD         = jjnr[jidx+3];
2071             jnrlistE         = jjnr[jidx+4];
2072             jnrlistF         = jjnr[jidx+5];
2073             jnrlistG         = jjnr[jidx+6];
2074             jnrlistH         = jjnr[jidx+7];
2075             /* Sign of each element will be negative for non-real atoms.
2076              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2077              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2078              */
2079             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2080                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2081                                             
2082             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2083             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2084             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2085             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2086             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2087             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2088             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2089             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2090             j_coord_offsetA  = DIM*jnrA;
2091             j_coord_offsetB  = DIM*jnrB;
2092             j_coord_offsetC  = DIM*jnrC;
2093             j_coord_offsetD  = DIM*jnrD;
2094             j_coord_offsetE  = DIM*jnrE;
2095             j_coord_offsetF  = DIM*jnrF;
2096             j_coord_offsetG  = DIM*jnrG;
2097             j_coord_offsetH  = DIM*jnrH;
2098
2099             /* load j atom coordinates */
2100             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2101                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2102                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2103                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2104                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2105
2106             /* Calculate displacement vector */
2107             dx00             = _mm256_sub_ps(ix0,jx0);
2108             dy00             = _mm256_sub_ps(iy0,jy0);
2109             dz00             = _mm256_sub_ps(iz0,jz0);
2110             dx01             = _mm256_sub_ps(ix0,jx1);
2111             dy01             = _mm256_sub_ps(iy0,jy1);
2112             dz01             = _mm256_sub_ps(iz0,jz1);
2113             dx02             = _mm256_sub_ps(ix0,jx2);
2114             dy02             = _mm256_sub_ps(iy0,jy2);
2115             dz02             = _mm256_sub_ps(iz0,jz2);
2116             dx10             = _mm256_sub_ps(ix1,jx0);
2117             dy10             = _mm256_sub_ps(iy1,jy0);
2118             dz10             = _mm256_sub_ps(iz1,jz0);
2119             dx11             = _mm256_sub_ps(ix1,jx1);
2120             dy11             = _mm256_sub_ps(iy1,jy1);
2121             dz11             = _mm256_sub_ps(iz1,jz1);
2122             dx12             = _mm256_sub_ps(ix1,jx2);
2123             dy12             = _mm256_sub_ps(iy1,jy2);
2124             dz12             = _mm256_sub_ps(iz1,jz2);
2125             dx20             = _mm256_sub_ps(ix2,jx0);
2126             dy20             = _mm256_sub_ps(iy2,jy0);
2127             dz20             = _mm256_sub_ps(iz2,jz0);
2128             dx21             = _mm256_sub_ps(ix2,jx1);
2129             dy21             = _mm256_sub_ps(iy2,jy1);
2130             dz21             = _mm256_sub_ps(iz2,jz1);
2131             dx22             = _mm256_sub_ps(ix2,jx2);
2132             dy22             = _mm256_sub_ps(iy2,jy2);
2133             dz22             = _mm256_sub_ps(iz2,jz2);
2134
2135             /* Calculate squared distance and things based on it */
2136             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2137             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2138             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2139             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2140             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2141             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2142             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2143             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2144             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2145
2146             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
2147             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
2148             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
2149             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
2150             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2151             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2152             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
2153             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2154             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2155
2156             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
2157             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
2158             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
2159             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
2160             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
2161             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
2162             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
2163             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
2164             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
2165
2166             fjx0             = _mm256_setzero_ps();
2167             fjy0             = _mm256_setzero_ps();
2168             fjz0             = _mm256_setzero_ps();
2169             fjx1             = _mm256_setzero_ps();
2170             fjy1             = _mm256_setzero_ps();
2171             fjz1             = _mm256_setzero_ps();
2172             fjx2             = _mm256_setzero_ps();
2173             fjy2             = _mm256_setzero_ps();
2174             fjz2             = _mm256_setzero_ps();
2175
2176             /**************************
2177              * CALCULATE INTERACTIONS *
2178              **************************/
2179
2180             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2181             {
2182
2183             r00              = _mm256_mul_ps(rsq00,rinv00);
2184             r00              = _mm256_andnot_ps(dummy_mask,r00);
2185
2186             /* EWALD ELECTROSTATICS */
2187             
2188             /* Analytical PME correction */
2189             zeta2            = _mm256_mul_ps(beta2,rsq00);
2190             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
2191             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2192             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2193             felec            = _mm256_mul_ps(qq00,felec);
2194             
2195             /* LENNARD-JONES DISPERSION/REPULSION */
2196
2197             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2198             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2199
2200             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2201
2202             fscal            = _mm256_add_ps(felec,fvdw);
2203
2204             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2205
2206             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2207
2208             /* Calculate temporary vectorial force */
2209             tx               = _mm256_mul_ps(fscal,dx00);
2210             ty               = _mm256_mul_ps(fscal,dy00);
2211             tz               = _mm256_mul_ps(fscal,dz00);
2212
2213             /* Update vectorial force */
2214             fix0             = _mm256_add_ps(fix0,tx);
2215             fiy0             = _mm256_add_ps(fiy0,ty);
2216             fiz0             = _mm256_add_ps(fiz0,tz);
2217
2218             fjx0             = _mm256_add_ps(fjx0,tx);
2219             fjy0             = _mm256_add_ps(fjy0,ty);
2220             fjz0             = _mm256_add_ps(fjz0,tz);
2221
2222             }
2223
2224             /**************************
2225              * CALCULATE INTERACTIONS *
2226              **************************/
2227
2228             if (gmx_mm256_any_lt(rsq01,rcutoff2))
2229             {
2230
2231             r01              = _mm256_mul_ps(rsq01,rinv01);
2232             r01              = _mm256_andnot_ps(dummy_mask,r01);
2233
2234             /* EWALD ELECTROSTATICS */
2235             
2236             /* Analytical PME correction */
2237             zeta2            = _mm256_mul_ps(beta2,rsq01);
2238             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
2239             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2240             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2241             felec            = _mm256_mul_ps(qq01,felec);
2242             
2243             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2244
2245             fscal            = felec;
2246
2247             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2248
2249             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2250
2251             /* Calculate temporary vectorial force */
2252             tx               = _mm256_mul_ps(fscal,dx01);
2253             ty               = _mm256_mul_ps(fscal,dy01);
2254             tz               = _mm256_mul_ps(fscal,dz01);
2255
2256             /* Update vectorial force */
2257             fix0             = _mm256_add_ps(fix0,tx);
2258             fiy0             = _mm256_add_ps(fiy0,ty);
2259             fiz0             = _mm256_add_ps(fiz0,tz);
2260
2261             fjx1             = _mm256_add_ps(fjx1,tx);
2262             fjy1             = _mm256_add_ps(fjy1,ty);
2263             fjz1             = _mm256_add_ps(fjz1,tz);
2264
2265             }
2266
2267             /**************************
2268              * CALCULATE INTERACTIONS *
2269              **************************/
2270
2271             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2272             {
2273
2274             r02              = _mm256_mul_ps(rsq02,rinv02);
2275             r02              = _mm256_andnot_ps(dummy_mask,r02);
2276
2277             /* EWALD ELECTROSTATICS */
2278             
2279             /* Analytical PME correction */
2280             zeta2            = _mm256_mul_ps(beta2,rsq02);
2281             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
2282             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2283             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2284             felec            = _mm256_mul_ps(qq02,felec);
2285             
2286             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2287
2288             fscal            = felec;
2289
2290             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2291
2292             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2293
2294             /* Calculate temporary vectorial force */
2295             tx               = _mm256_mul_ps(fscal,dx02);
2296             ty               = _mm256_mul_ps(fscal,dy02);
2297             tz               = _mm256_mul_ps(fscal,dz02);
2298
2299             /* Update vectorial force */
2300             fix0             = _mm256_add_ps(fix0,tx);
2301             fiy0             = _mm256_add_ps(fiy0,ty);
2302             fiz0             = _mm256_add_ps(fiz0,tz);
2303
2304             fjx2             = _mm256_add_ps(fjx2,tx);
2305             fjy2             = _mm256_add_ps(fjy2,ty);
2306             fjz2             = _mm256_add_ps(fjz2,tz);
2307
2308             }
2309
2310             /**************************
2311              * CALCULATE INTERACTIONS *
2312              **************************/
2313
2314             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2315             {
2316
2317             r10              = _mm256_mul_ps(rsq10,rinv10);
2318             r10              = _mm256_andnot_ps(dummy_mask,r10);
2319
2320             /* EWALD ELECTROSTATICS */
2321             
2322             /* Analytical PME correction */
2323             zeta2            = _mm256_mul_ps(beta2,rsq10);
2324             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
2325             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2326             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2327             felec            = _mm256_mul_ps(qq10,felec);
2328             
2329             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2330
2331             fscal            = felec;
2332
2333             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2334
2335             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2336
2337             /* Calculate temporary vectorial force */
2338             tx               = _mm256_mul_ps(fscal,dx10);
2339             ty               = _mm256_mul_ps(fscal,dy10);
2340             tz               = _mm256_mul_ps(fscal,dz10);
2341
2342             /* Update vectorial force */
2343             fix1             = _mm256_add_ps(fix1,tx);
2344             fiy1             = _mm256_add_ps(fiy1,ty);
2345             fiz1             = _mm256_add_ps(fiz1,tz);
2346
2347             fjx0             = _mm256_add_ps(fjx0,tx);
2348             fjy0             = _mm256_add_ps(fjy0,ty);
2349             fjz0             = _mm256_add_ps(fjz0,tz);
2350
2351             }
2352
2353             /**************************
2354              * CALCULATE INTERACTIONS *
2355              **************************/
2356
2357             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2358             {
2359
2360             r11              = _mm256_mul_ps(rsq11,rinv11);
2361             r11              = _mm256_andnot_ps(dummy_mask,r11);
2362
2363             /* EWALD ELECTROSTATICS */
2364             
2365             /* Analytical PME correction */
2366             zeta2            = _mm256_mul_ps(beta2,rsq11);
2367             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2368             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2369             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2370             felec            = _mm256_mul_ps(qq11,felec);
2371             
2372             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2373
2374             fscal            = felec;
2375
2376             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2377
2378             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2379
2380             /* Calculate temporary vectorial force */
2381             tx               = _mm256_mul_ps(fscal,dx11);
2382             ty               = _mm256_mul_ps(fscal,dy11);
2383             tz               = _mm256_mul_ps(fscal,dz11);
2384
2385             /* Update vectorial force */
2386             fix1             = _mm256_add_ps(fix1,tx);
2387             fiy1             = _mm256_add_ps(fiy1,ty);
2388             fiz1             = _mm256_add_ps(fiz1,tz);
2389
2390             fjx1             = _mm256_add_ps(fjx1,tx);
2391             fjy1             = _mm256_add_ps(fjy1,ty);
2392             fjz1             = _mm256_add_ps(fjz1,tz);
2393
2394             }
2395
2396             /**************************
2397              * CALCULATE INTERACTIONS *
2398              **************************/
2399
2400             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2401             {
2402
2403             r12              = _mm256_mul_ps(rsq12,rinv12);
2404             r12              = _mm256_andnot_ps(dummy_mask,r12);
2405
2406             /* EWALD ELECTROSTATICS */
2407             
2408             /* Analytical PME correction */
2409             zeta2            = _mm256_mul_ps(beta2,rsq12);
2410             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2411             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2412             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2413             felec            = _mm256_mul_ps(qq12,felec);
2414             
2415             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2416
2417             fscal            = felec;
2418
2419             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2420
2421             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2422
2423             /* Calculate temporary vectorial force */
2424             tx               = _mm256_mul_ps(fscal,dx12);
2425             ty               = _mm256_mul_ps(fscal,dy12);
2426             tz               = _mm256_mul_ps(fscal,dz12);
2427
2428             /* Update vectorial force */
2429             fix1             = _mm256_add_ps(fix1,tx);
2430             fiy1             = _mm256_add_ps(fiy1,ty);
2431             fiz1             = _mm256_add_ps(fiz1,tz);
2432
2433             fjx2             = _mm256_add_ps(fjx2,tx);
2434             fjy2             = _mm256_add_ps(fjy2,ty);
2435             fjz2             = _mm256_add_ps(fjz2,tz);
2436
2437             }
2438
2439             /**************************
2440              * CALCULATE INTERACTIONS *
2441              **************************/
2442
2443             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2444             {
2445
2446             r20              = _mm256_mul_ps(rsq20,rinv20);
2447             r20              = _mm256_andnot_ps(dummy_mask,r20);
2448
2449             /* EWALD ELECTROSTATICS */
2450             
2451             /* Analytical PME correction */
2452             zeta2            = _mm256_mul_ps(beta2,rsq20);
2453             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
2454             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2455             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2456             felec            = _mm256_mul_ps(qq20,felec);
2457             
2458             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2459
2460             fscal            = felec;
2461
2462             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2463
2464             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2465
2466             /* Calculate temporary vectorial force */
2467             tx               = _mm256_mul_ps(fscal,dx20);
2468             ty               = _mm256_mul_ps(fscal,dy20);
2469             tz               = _mm256_mul_ps(fscal,dz20);
2470
2471             /* Update vectorial force */
2472             fix2             = _mm256_add_ps(fix2,tx);
2473             fiy2             = _mm256_add_ps(fiy2,ty);
2474             fiz2             = _mm256_add_ps(fiz2,tz);
2475
2476             fjx0             = _mm256_add_ps(fjx0,tx);
2477             fjy0             = _mm256_add_ps(fjy0,ty);
2478             fjz0             = _mm256_add_ps(fjz0,tz);
2479
2480             }
2481
2482             /**************************
2483              * CALCULATE INTERACTIONS *
2484              **************************/
2485
2486             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2487             {
2488
2489             r21              = _mm256_mul_ps(rsq21,rinv21);
2490             r21              = _mm256_andnot_ps(dummy_mask,r21);
2491
2492             /* EWALD ELECTROSTATICS */
2493             
2494             /* Analytical PME correction */
2495             zeta2            = _mm256_mul_ps(beta2,rsq21);
2496             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2497             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2498             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2499             felec            = _mm256_mul_ps(qq21,felec);
2500             
2501             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2502
2503             fscal            = felec;
2504
2505             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2506
2507             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2508
2509             /* Calculate temporary vectorial force */
2510             tx               = _mm256_mul_ps(fscal,dx21);
2511             ty               = _mm256_mul_ps(fscal,dy21);
2512             tz               = _mm256_mul_ps(fscal,dz21);
2513
2514             /* Update vectorial force */
2515             fix2             = _mm256_add_ps(fix2,tx);
2516             fiy2             = _mm256_add_ps(fiy2,ty);
2517             fiz2             = _mm256_add_ps(fiz2,tz);
2518
2519             fjx1             = _mm256_add_ps(fjx1,tx);
2520             fjy1             = _mm256_add_ps(fjy1,ty);
2521             fjz1             = _mm256_add_ps(fjz1,tz);
2522
2523             }
2524
2525             /**************************
2526              * CALCULATE INTERACTIONS *
2527              **************************/
2528
2529             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2530             {
2531
2532             r22              = _mm256_mul_ps(rsq22,rinv22);
2533             r22              = _mm256_andnot_ps(dummy_mask,r22);
2534
2535             /* EWALD ELECTROSTATICS */
2536             
2537             /* Analytical PME correction */
2538             zeta2            = _mm256_mul_ps(beta2,rsq22);
2539             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2540             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2541             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2542             felec            = _mm256_mul_ps(qq22,felec);
2543             
2544             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2545
2546             fscal            = felec;
2547
2548             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2549
2550             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2551
2552             /* Calculate temporary vectorial force */
2553             tx               = _mm256_mul_ps(fscal,dx22);
2554             ty               = _mm256_mul_ps(fscal,dy22);
2555             tz               = _mm256_mul_ps(fscal,dz22);
2556
2557             /* Update vectorial force */
2558             fix2             = _mm256_add_ps(fix2,tx);
2559             fiy2             = _mm256_add_ps(fiy2,ty);
2560             fiz2             = _mm256_add_ps(fiz2,tz);
2561
2562             fjx2             = _mm256_add_ps(fjx2,tx);
2563             fjy2             = _mm256_add_ps(fjy2,ty);
2564             fjz2             = _mm256_add_ps(fjz2,tz);
2565
2566             }
2567
2568             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2569             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2570             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2571             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2572             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2573             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2574             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2575             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2576
2577             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2578                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2579
2580             /* Inner loop uses 547 flops */
2581         }
2582
2583         /* End of innermost loop */
2584
2585         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2586                                                  f+i_coord_offset,fshift+i_shift_offset);
2587
2588         /* Increment number of inner iterations */
2589         inneriter                  += j_index_end - j_index_start;
2590
2591         /* Outer loop uses 18 flops */
2592     }
2593
2594     /* Increment number of outer iterations */
2595     outeriter        += nri;
2596
2597     /* Update outer/inner flops */
2598
2599     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*547);
2600 }