Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_avx_256_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_single
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_VF_avx_256_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. the eight different
69      * jnr indices corresponding to data put in the eight positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrE,jnrF,jnrG,jnrH;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
83     real             scratch[4*DIM];
84     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     real *           vdwioffsetptr0;
86     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     real *           vdwioffsetptr1;
88     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     real *           vdwioffsetptr2;
90     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
92     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
94     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
96     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
113     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
114     __m256i          ewitab;
115     __m128i          ewitab_lo,ewitab_hi;
116     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
117     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
118     real             *ewtab;
119     __m256           dummy_mask,cutoff_mask;
120     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
121     __m256           one     = _mm256_set1_ps(1.0);
122     __m256           two     = _mm256_set1_ps(2.0);
123     x                = xx[0];
124     f                = ff[0];
125
126     nri              = nlist->nri;
127     iinr             = nlist->iinr;
128     jindex           = nlist->jindex;
129     jjnr             = nlist->jjnr;
130     shiftidx         = nlist->shift;
131     gid              = nlist->gid;
132     shiftvec         = fr->shift_vec[0];
133     fshift           = fr->fshift[0];
134     facel            = _mm256_set1_ps(fr->ic->epsfac);
135     charge           = mdatoms->chargeA;
136     nvdwtype         = fr->ntype;
137     vdwparam         = fr->nbfp;
138     vdwtype          = mdatoms->typeA;
139
140     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
141     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
142     beta2            = _mm256_mul_ps(beta,beta);
143     beta3            = _mm256_mul_ps(beta,beta2);
144
145     ewtab            = fr->ic->tabq_coul_FDV0;
146     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
147     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
148
149     /* Setup water-specific parameters */
150     inr              = nlist->iinr[0];
151     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
152     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
153     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
154     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
155
156     jq0              = _mm256_set1_ps(charge[inr+0]);
157     jq1              = _mm256_set1_ps(charge[inr+1]);
158     jq2              = _mm256_set1_ps(charge[inr+2]);
159     vdwjidx0A        = 2*vdwtype[inr+0];
160     qq00             = _mm256_mul_ps(iq0,jq0);
161     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
162     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
163     qq01             = _mm256_mul_ps(iq0,jq1);
164     qq02             = _mm256_mul_ps(iq0,jq2);
165     qq10             = _mm256_mul_ps(iq1,jq0);
166     qq11             = _mm256_mul_ps(iq1,jq1);
167     qq12             = _mm256_mul_ps(iq1,jq2);
168     qq20             = _mm256_mul_ps(iq2,jq0);
169     qq21             = _mm256_mul_ps(iq2,jq1);
170     qq22             = _mm256_mul_ps(iq2,jq2);
171
172     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
173     rcutoff_scalar   = fr->ic->rcoulomb;
174     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
175     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
176
177     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
178     rvdw             = _mm256_set1_ps(fr->ic->rvdw);
179
180     /* Avoid stupid compiler warnings */
181     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
182     j_coord_offsetA = 0;
183     j_coord_offsetB = 0;
184     j_coord_offsetC = 0;
185     j_coord_offsetD = 0;
186     j_coord_offsetE = 0;
187     j_coord_offsetF = 0;
188     j_coord_offsetG = 0;
189     j_coord_offsetH = 0;
190
191     outeriter        = 0;
192     inneriter        = 0;
193
194     for(iidx=0;iidx<4*DIM;iidx++)
195     {
196         scratch[iidx] = 0.0;
197     }
198
199     /* Start outer loop over neighborlists */
200     for(iidx=0; iidx<nri; iidx++)
201     {
202         /* Load shift vector for this list */
203         i_shift_offset   = DIM*shiftidx[iidx];
204
205         /* Load limits for loop over neighbors */
206         j_index_start    = jindex[iidx];
207         j_index_end      = jindex[iidx+1];
208
209         /* Get outer coordinate index */
210         inr              = iinr[iidx];
211         i_coord_offset   = DIM*inr;
212
213         /* Load i particle coords and add shift vector */
214         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
215                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
216
217         fix0             = _mm256_setzero_ps();
218         fiy0             = _mm256_setzero_ps();
219         fiz0             = _mm256_setzero_ps();
220         fix1             = _mm256_setzero_ps();
221         fiy1             = _mm256_setzero_ps();
222         fiz1             = _mm256_setzero_ps();
223         fix2             = _mm256_setzero_ps();
224         fiy2             = _mm256_setzero_ps();
225         fiz2             = _mm256_setzero_ps();
226
227         /* Reset potential sums */
228         velecsum         = _mm256_setzero_ps();
229         vvdwsum          = _mm256_setzero_ps();
230
231         /* Start inner kernel loop */
232         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
233         {
234
235             /* Get j neighbor index, and coordinate index */
236             jnrA             = jjnr[jidx];
237             jnrB             = jjnr[jidx+1];
238             jnrC             = jjnr[jidx+2];
239             jnrD             = jjnr[jidx+3];
240             jnrE             = jjnr[jidx+4];
241             jnrF             = jjnr[jidx+5];
242             jnrG             = jjnr[jidx+6];
243             jnrH             = jjnr[jidx+7];
244             j_coord_offsetA  = DIM*jnrA;
245             j_coord_offsetB  = DIM*jnrB;
246             j_coord_offsetC  = DIM*jnrC;
247             j_coord_offsetD  = DIM*jnrD;
248             j_coord_offsetE  = DIM*jnrE;
249             j_coord_offsetF  = DIM*jnrF;
250             j_coord_offsetG  = DIM*jnrG;
251             j_coord_offsetH  = DIM*jnrH;
252
253             /* load j atom coordinates */
254             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
255                                                  x+j_coord_offsetC,x+j_coord_offsetD,
256                                                  x+j_coord_offsetE,x+j_coord_offsetF,
257                                                  x+j_coord_offsetG,x+j_coord_offsetH,
258                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
259
260             /* Calculate displacement vector */
261             dx00             = _mm256_sub_ps(ix0,jx0);
262             dy00             = _mm256_sub_ps(iy0,jy0);
263             dz00             = _mm256_sub_ps(iz0,jz0);
264             dx01             = _mm256_sub_ps(ix0,jx1);
265             dy01             = _mm256_sub_ps(iy0,jy1);
266             dz01             = _mm256_sub_ps(iz0,jz1);
267             dx02             = _mm256_sub_ps(ix0,jx2);
268             dy02             = _mm256_sub_ps(iy0,jy2);
269             dz02             = _mm256_sub_ps(iz0,jz2);
270             dx10             = _mm256_sub_ps(ix1,jx0);
271             dy10             = _mm256_sub_ps(iy1,jy0);
272             dz10             = _mm256_sub_ps(iz1,jz0);
273             dx11             = _mm256_sub_ps(ix1,jx1);
274             dy11             = _mm256_sub_ps(iy1,jy1);
275             dz11             = _mm256_sub_ps(iz1,jz1);
276             dx12             = _mm256_sub_ps(ix1,jx2);
277             dy12             = _mm256_sub_ps(iy1,jy2);
278             dz12             = _mm256_sub_ps(iz1,jz2);
279             dx20             = _mm256_sub_ps(ix2,jx0);
280             dy20             = _mm256_sub_ps(iy2,jy0);
281             dz20             = _mm256_sub_ps(iz2,jz0);
282             dx21             = _mm256_sub_ps(ix2,jx1);
283             dy21             = _mm256_sub_ps(iy2,jy1);
284             dz21             = _mm256_sub_ps(iz2,jz1);
285             dx22             = _mm256_sub_ps(ix2,jx2);
286             dy22             = _mm256_sub_ps(iy2,jy2);
287             dz22             = _mm256_sub_ps(iz2,jz2);
288
289             /* Calculate squared distance and things based on it */
290             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
291             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
292             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
293             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
294             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
295             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
296             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
297             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
298             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
299
300             rinv00           = avx256_invsqrt_f(rsq00);
301             rinv01           = avx256_invsqrt_f(rsq01);
302             rinv02           = avx256_invsqrt_f(rsq02);
303             rinv10           = avx256_invsqrt_f(rsq10);
304             rinv11           = avx256_invsqrt_f(rsq11);
305             rinv12           = avx256_invsqrt_f(rsq12);
306             rinv20           = avx256_invsqrt_f(rsq20);
307             rinv21           = avx256_invsqrt_f(rsq21);
308             rinv22           = avx256_invsqrt_f(rsq22);
309
310             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
311             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
312             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
313             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
314             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
315             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
316             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
317             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
318             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
319
320             fjx0             = _mm256_setzero_ps();
321             fjy0             = _mm256_setzero_ps();
322             fjz0             = _mm256_setzero_ps();
323             fjx1             = _mm256_setzero_ps();
324             fjy1             = _mm256_setzero_ps();
325             fjz1             = _mm256_setzero_ps();
326             fjx2             = _mm256_setzero_ps();
327             fjy2             = _mm256_setzero_ps();
328             fjz2             = _mm256_setzero_ps();
329
330             /**************************
331              * CALCULATE INTERACTIONS *
332              **************************/
333
334             if (gmx_mm256_any_lt(rsq00,rcutoff2))
335             {
336
337             r00              = _mm256_mul_ps(rsq00,rinv00);
338
339             /* EWALD ELECTROSTATICS */
340             
341             /* Analytical PME correction */
342             zeta2            = _mm256_mul_ps(beta2,rsq00);
343             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
344             pmecorrF         = avx256_pmecorrF_f(zeta2);
345             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
346             felec            = _mm256_mul_ps(qq00,felec);
347             pmecorrV         = avx256_pmecorrV_f(zeta2);
348             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
349             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
350             velec            = _mm256_mul_ps(qq00,velec);
351             
352             /* LENNARD-JONES DISPERSION/REPULSION */
353
354             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
355             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
356             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
357             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
358                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
359             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
360
361             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
362
363             /* Update potential sum for this i atom from the interaction with this j atom. */
364             velec            = _mm256_and_ps(velec,cutoff_mask);
365             velecsum         = _mm256_add_ps(velecsum,velec);
366             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
367             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
368
369             fscal            = _mm256_add_ps(felec,fvdw);
370
371             fscal            = _mm256_and_ps(fscal,cutoff_mask);
372
373             /* Calculate temporary vectorial force */
374             tx               = _mm256_mul_ps(fscal,dx00);
375             ty               = _mm256_mul_ps(fscal,dy00);
376             tz               = _mm256_mul_ps(fscal,dz00);
377
378             /* Update vectorial force */
379             fix0             = _mm256_add_ps(fix0,tx);
380             fiy0             = _mm256_add_ps(fiy0,ty);
381             fiz0             = _mm256_add_ps(fiz0,tz);
382
383             fjx0             = _mm256_add_ps(fjx0,tx);
384             fjy0             = _mm256_add_ps(fjy0,ty);
385             fjz0             = _mm256_add_ps(fjz0,tz);
386
387             }
388
389             /**************************
390              * CALCULATE INTERACTIONS *
391              **************************/
392
393             if (gmx_mm256_any_lt(rsq01,rcutoff2))
394             {
395
396             r01              = _mm256_mul_ps(rsq01,rinv01);
397
398             /* EWALD ELECTROSTATICS */
399             
400             /* Analytical PME correction */
401             zeta2            = _mm256_mul_ps(beta2,rsq01);
402             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
403             pmecorrF         = avx256_pmecorrF_f(zeta2);
404             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
405             felec            = _mm256_mul_ps(qq01,felec);
406             pmecorrV         = avx256_pmecorrV_f(zeta2);
407             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
408             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv01,sh_ewald),pmecorrV);
409             velec            = _mm256_mul_ps(qq01,velec);
410             
411             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
412
413             /* Update potential sum for this i atom from the interaction with this j atom. */
414             velec            = _mm256_and_ps(velec,cutoff_mask);
415             velecsum         = _mm256_add_ps(velecsum,velec);
416
417             fscal            = felec;
418
419             fscal            = _mm256_and_ps(fscal,cutoff_mask);
420
421             /* Calculate temporary vectorial force */
422             tx               = _mm256_mul_ps(fscal,dx01);
423             ty               = _mm256_mul_ps(fscal,dy01);
424             tz               = _mm256_mul_ps(fscal,dz01);
425
426             /* Update vectorial force */
427             fix0             = _mm256_add_ps(fix0,tx);
428             fiy0             = _mm256_add_ps(fiy0,ty);
429             fiz0             = _mm256_add_ps(fiz0,tz);
430
431             fjx1             = _mm256_add_ps(fjx1,tx);
432             fjy1             = _mm256_add_ps(fjy1,ty);
433             fjz1             = _mm256_add_ps(fjz1,tz);
434
435             }
436
437             /**************************
438              * CALCULATE INTERACTIONS *
439              **************************/
440
441             if (gmx_mm256_any_lt(rsq02,rcutoff2))
442             {
443
444             r02              = _mm256_mul_ps(rsq02,rinv02);
445
446             /* EWALD ELECTROSTATICS */
447             
448             /* Analytical PME correction */
449             zeta2            = _mm256_mul_ps(beta2,rsq02);
450             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
451             pmecorrF         = avx256_pmecorrF_f(zeta2);
452             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
453             felec            = _mm256_mul_ps(qq02,felec);
454             pmecorrV         = avx256_pmecorrV_f(zeta2);
455             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
456             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv02,sh_ewald),pmecorrV);
457             velec            = _mm256_mul_ps(qq02,velec);
458             
459             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
460
461             /* Update potential sum for this i atom from the interaction with this j atom. */
462             velec            = _mm256_and_ps(velec,cutoff_mask);
463             velecsum         = _mm256_add_ps(velecsum,velec);
464
465             fscal            = felec;
466
467             fscal            = _mm256_and_ps(fscal,cutoff_mask);
468
469             /* Calculate temporary vectorial force */
470             tx               = _mm256_mul_ps(fscal,dx02);
471             ty               = _mm256_mul_ps(fscal,dy02);
472             tz               = _mm256_mul_ps(fscal,dz02);
473
474             /* Update vectorial force */
475             fix0             = _mm256_add_ps(fix0,tx);
476             fiy0             = _mm256_add_ps(fiy0,ty);
477             fiz0             = _mm256_add_ps(fiz0,tz);
478
479             fjx2             = _mm256_add_ps(fjx2,tx);
480             fjy2             = _mm256_add_ps(fjy2,ty);
481             fjz2             = _mm256_add_ps(fjz2,tz);
482
483             }
484
485             /**************************
486              * CALCULATE INTERACTIONS *
487              **************************/
488
489             if (gmx_mm256_any_lt(rsq10,rcutoff2))
490             {
491
492             r10              = _mm256_mul_ps(rsq10,rinv10);
493
494             /* EWALD ELECTROSTATICS */
495             
496             /* Analytical PME correction */
497             zeta2            = _mm256_mul_ps(beta2,rsq10);
498             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
499             pmecorrF         = avx256_pmecorrF_f(zeta2);
500             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
501             felec            = _mm256_mul_ps(qq10,felec);
502             pmecorrV         = avx256_pmecorrV_f(zeta2);
503             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
504             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
505             velec            = _mm256_mul_ps(qq10,velec);
506             
507             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
508
509             /* Update potential sum for this i atom from the interaction with this j atom. */
510             velec            = _mm256_and_ps(velec,cutoff_mask);
511             velecsum         = _mm256_add_ps(velecsum,velec);
512
513             fscal            = felec;
514
515             fscal            = _mm256_and_ps(fscal,cutoff_mask);
516
517             /* Calculate temporary vectorial force */
518             tx               = _mm256_mul_ps(fscal,dx10);
519             ty               = _mm256_mul_ps(fscal,dy10);
520             tz               = _mm256_mul_ps(fscal,dz10);
521
522             /* Update vectorial force */
523             fix1             = _mm256_add_ps(fix1,tx);
524             fiy1             = _mm256_add_ps(fiy1,ty);
525             fiz1             = _mm256_add_ps(fiz1,tz);
526
527             fjx0             = _mm256_add_ps(fjx0,tx);
528             fjy0             = _mm256_add_ps(fjy0,ty);
529             fjz0             = _mm256_add_ps(fjz0,tz);
530
531             }
532
533             /**************************
534              * CALCULATE INTERACTIONS *
535              **************************/
536
537             if (gmx_mm256_any_lt(rsq11,rcutoff2))
538             {
539
540             r11              = _mm256_mul_ps(rsq11,rinv11);
541
542             /* EWALD ELECTROSTATICS */
543             
544             /* Analytical PME correction */
545             zeta2            = _mm256_mul_ps(beta2,rsq11);
546             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
547             pmecorrF         = avx256_pmecorrF_f(zeta2);
548             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
549             felec            = _mm256_mul_ps(qq11,felec);
550             pmecorrV         = avx256_pmecorrV_f(zeta2);
551             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
552             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
553             velec            = _mm256_mul_ps(qq11,velec);
554             
555             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
556
557             /* Update potential sum for this i atom from the interaction with this j atom. */
558             velec            = _mm256_and_ps(velec,cutoff_mask);
559             velecsum         = _mm256_add_ps(velecsum,velec);
560
561             fscal            = felec;
562
563             fscal            = _mm256_and_ps(fscal,cutoff_mask);
564
565             /* Calculate temporary vectorial force */
566             tx               = _mm256_mul_ps(fscal,dx11);
567             ty               = _mm256_mul_ps(fscal,dy11);
568             tz               = _mm256_mul_ps(fscal,dz11);
569
570             /* Update vectorial force */
571             fix1             = _mm256_add_ps(fix1,tx);
572             fiy1             = _mm256_add_ps(fiy1,ty);
573             fiz1             = _mm256_add_ps(fiz1,tz);
574
575             fjx1             = _mm256_add_ps(fjx1,tx);
576             fjy1             = _mm256_add_ps(fjy1,ty);
577             fjz1             = _mm256_add_ps(fjz1,tz);
578
579             }
580
581             /**************************
582              * CALCULATE INTERACTIONS *
583              **************************/
584
585             if (gmx_mm256_any_lt(rsq12,rcutoff2))
586             {
587
588             r12              = _mm256_mul_ps(rsq12,rinv12);
589
590             /* EWALD ELECTROSTATICS */
591             
592             /* Analytical PME correction */
593             zeta2            = _mm256_mul_ps(beta2,rsq12);
594             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
595             pmecorrF         = avx256_pmecorrF_f(zeta2);
596             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
597             felec            = _mm256_mul_ps(qq12,felec);
598             pmecorrV         = avx256_pmecorrV_f(zeta2);
599             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
600             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
601             velec            = _mm256_mul_ps(qq12,velec);
602             
603             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
604
605             /* Update potential sum for this i atom from the interaction with this j atom. */
606             velec            = _mm256_and_ps(velec,cutoff_mask);
607             velecsum         = _mm256_add_ps(velecsum,velec);
608
609             fscal            = felec;
610
611             fscal            = _mm256_and_ps(fscal,cutoff_mask);
612
613             /* Calculate temporary vectorial force */
614             tx               = _mm256_mul_ps(fscal,dx12);
615             ty               = _mm256_mul_ps(fscal,dy12);
616             tz               = _mm256_mul_ps(fscal,dz12);
617
618             /* Update vectorial force */
619             fix1             = _mm256_add_ps(fix1,tx);
620             fiy1             = _mm256_add_ps(fiy1,ty);
621             fiz1             = _mm256_add_ps(fiz1,tz);
622
623             fjx2             = _mm256_add_ps(fjx2,tx);
624             fjy2             = _mm256_add_ps(fjy2,ty);
625             fjz2             = _mm256_add_ps(fjz2,tz);
626
627             }
628
629             /**************************
630              * CALCULATE INTERACTIONS *
631              **************************/
632
633             if (gmx_mm256_any_lt(rsq20,rcutoff2))
634             {
635
636             r20              = _mm256_mul_ps(rsq20,rinv20);
637
638             /* EWALD ELECTROSTATICS */
639             
640             /* Analytical PME correction */
641             zeta2            = _mm256_mul_ps(beta2,rsq20);
642             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
643             pmecorrF         = avx256_pmecorrF_f(zeta2);
644             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
645             felec            = _mm256_mul_ps(qq20,felec);
646             pmecorrV         = avx256_pmecorrV_f(zeta2);
647             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
648             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
649             velec            = _mm256_mul_ps(qq20,velec);
650             
651             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
652
653             /* Update potential sum for this i atom from the interaction with this j atom. */
654             velec            = _mm256_and_ps(velec,cutoff_mask);
655             velecsum         = _mm256_add_ps(velecsum,velec);
656
657             fscal            = felec;
658
659             fscal            = _mm256_and_ps(fscal,cutoff_mask);
660
661             /* Calculate temporary vectorial force */
662             tx               = _mm256_mul_ps(fscal,dx20);
663             ty               = _mm256_mul_ps(fscal,dy20);
664             tz               = _mm256_mul_ps(fscal,dz20);
665
666             /* Update vectorial force */
667             fix2             = _mm256_add_ps(fix2,tx);
668             fiy2             = _mm256_add_ps(fiy2,ty);
669             fiz2             = _mm256_add_ps(fiz2,tz);
670
671             fjx0             = _mm256_add_ps(fjx0,tx);
672             fjy0             = _mm256_add_ps(fjy0,ty);
673             fjz0             = _mm256_add_ps(fjz0,tz);
674
675             }
676
677             /**************************
678              * CALCULATE INTERACTIONS *
679              **************************/
680
681             if (gmx_mm256_any_lt(rsq21,rcutoff2))
682             {
683
684             r21              = _mm256_mul_ps(rsq21,rinv21);
685
686             /* EWALD ELECTROSTATICS */
687             
688             /* Analytical PME correction */
689             zeta2            = _mm256_mul_ps(beta2,rsq21);
690             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
691             pmecorrF         = avx256_pmecorrF_f(zeta2);
692             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
693             felec            = _mm256_mul_ps(qq21,felec);
694             pmecorrV         = avx256_pmecorrV_f(zeta2);
695             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
696             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
697             velec            = _mm256_mul_ps(qq21,velec);
698             
699             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
700
701             /* Update potential sum for this i atom from the interaction with this j atom. */
702             velec            = _mm256_and_ps(velec,cutoff_mask);
703             velecsum         = _mm256_add_ps(velecsum,velec);
704
705             fscal            = felec;
706
707             fscal            = _mm256_and_ps(fscal,cutoff_mask);
708
709             /* Calculate temporary vectorial force */
710             tx               = _mm256_mul_ps(fscal,dx21);
711             ty               = _mm256_mul_ps(fscal,dy21);
712             tz               = _mm256_mul_ps(fscal,dz21);
713
714             /* Update vectorial force */
715             fix2             = _mm256_add_ps(fix2,tx);
716             fiy2             = _mm256_add_ps(fiy2,ty);
717             fiz2             = _mm256_add_ps(fiz2,tz);
718
719             fjx1             = _mm256_add_ps(fjx1,tx);
720             fjy1             = _mm256_add_ps(fjy1,ty);
721             fjz1             = _mm256_add_ps(fjz1,tz);
722
723             }
724
725             /**************************
726              * CALCULATE INTERACTIONS *
727              **************************/
728
729             if (gmx_mm256_any_lt(rsq22,rcutoff2))
730             {
731
732             r22              = _mm256_mul_ps(rsq22,rinv22);
733
734             /* EWALD ELECTROSTATICS */
735             
736             /* Analytical PME correction */
737             zeta2            = _mm256_mul_ps(beta2,rsq22);
738             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
739             pmecorrF         = avx256_pmecorrF_f(zeta2);
740             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
741             felec            = _mm256_mul_ps(qq22,felec);
742             pmecorrV         = avx256_pmecorrV_f(zeta2);
743             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
744             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
745             velec            = _mm256_mul_ps(qq22,velec);
746             
747             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
748
749             /* Update potential sum for this i atom from the interaction with this j atom. */
750             velec            = _mm256_and_ps(velec,cutoff_mask);
751             velecsum         = _mm256_add_ps(velecsum,velec);
752
753             fscal            = felec;
754
755             fscal            = _mm256_and_ps(fscal,cutoff_mask);
756
757             /* Calculate temporary vectorial force */
758             tx               = _mm256_mul_ps(fscal,dx22);
759             ty               = _mm256_mul_ps(fscal,dy22);
760             tz               = _mm256_mul_ps(fscal,dz22);
761
762             /* Update vectorial force */
763             fix2             = _mm256_add_ps(fix2,tx);
764             fiy2             = _mm256_add_ps(fiy2,ty);
765             fiz2             = _mm256_add_ps(fiz2,tz);
766
767             fjx2             = _mm256_add_ps(fjx2,tx);
768             fjy2             = _mm256_add_ps(fjy2,ty);
769             fjz2             = _mm256_add_ps(fjz2,tz);
770
771             }
772
773             fjptrA             = f+j_coord_offsetA;
774             fjptrB             = f+j_coord_offsetB;
775             fjptrC             = f+j_coord_offsetC;
776             fjptrD             = f+j_coord_offsetD;
777             fjptrE             = f+j_coord_offsetE;
778             fjptrF             = f+j_coord_offsetF;
779             fjptrG             = f+j_coord_offsetG;
780             fjptrH             = f+j_coord_offsetH;
781
782             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
783                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
784
785             /* Inner loop uses 999 flops */
786         }
787
788         if(jidx<j_index_end)
789         {
790
791             /* Get j neighbor index, and coordinate index */
792             jnrlistA         = jjnr[jidx];
793             jnrlistB         = jjnr[jidx+1];
794             jnrlistC         = jjnr[jidx+2];
795             jnrlistD         = jjnr[jidx+3];
796             jnrlistE         = jjnr[jidx+4];
797             jnrlistF         = jjnr[jidx+5];
798             jnrlistG         = jjnr[jidx+6];
799             jnrlistH         = jjnr[jidx+7];
800             /* The j index is negative for non-real (dummy) atoms.
801              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
802              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
803              */
804             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
805                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
806                                             
807             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
808             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
809             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
810             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
811             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
812             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
813             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
814             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
815             j_coord_offsetA  = DIM*jnrA;
816             j_coord_offsetB  = DIM*jnrB;
817             j_coord_offsetC  = DIM*jnrC;
818             j_coord_offsetD  = DIM*jnrD;
819             j_coord_offsetE  = DIM*jnrE;
820             j_coord_offsetF  = DIM*jnrF;
821             j_coord_offsetG  = DIM*jnrG;
822             j_coord_offsetH  = DIM*jnrH;
823
824             /* load j atom coordinates */
825             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
826                                                  x+j_coord_offsetC,x+j_coord_offsetD,
827                                                  x+j_coord_offsetE,x+j_coord_offsetF,
828                                                  x+j_coord_offsetG,x+j_coord_offsetH,
829                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
830
831             /* Calculate displacement vector */
832             dx00             = _mm256_sub_ps(ix0,jx0);
833             dy00             = _mm256_sub_ps(iy0,jy0);
834             dz00             = _mm256_sub_ps(iz0,jz0);
835             dx01             = _mm256_sub_ps(ix0,jx1);
836             dy01             = _mm256_sub_ps(iy0,jy1);
837             dz01             = _mm256_sub_ps(iz0,jz1);
838             dx02             = _mm256_sub_ps(ix0,jx2);
839             dy02             = _mm256_sub_ps(iy0,jy2);
840             dz02             = _mm256_sub_ps(iz0,jz2);
841             dx10             = _mm256_sub_ps(ix1,jx0);
842             dy10             = _mm256_sub_ps(iy1,jy0);
843             dz10             = _mm256_sub_ps(iz1,jz0);
844             dx11             = _mm256_sub_ps(ix1,jx1);
845             dy11             = _mm256_sub_ps(iy1,jy1);
846             dz11             = _mm256_sub_ps(iz1,jz1);
847             dx12             = _mm256_sub_ps(ix1,jx2);
848             dy12             = _mm256_sub_ps(iy1,jy2);
849             dz12             = _mm256_sub_ps(iz1,jz2);
850             dx20             = _mm256_sub_ps(ix2,jx0);
851             dy20             = _mm256_sub_ps(iy2,jy0);
852             dz20             = _mm256_sub_ps(iz2,jz0);
853             dx21             = _mm256_sub_ps(ix2,jx1);
854             dy21             = _mm256_sub_ps(iy2,jy1);
855             dz21             = _mm256_sub_ps(iz2,jz1);
856             dx22             = _mm256_sub_ps(ix2,jx2);
857             dy22             = _mm256_sub_ps(iy2,jy2);
858             dz22             = _mm256_sub_ps(iz2,jz2);
859
860             /* Calculate squared distance and things based on it */
861             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
862             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
863             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
864             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
865             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
866             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
867             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
868             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
869             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
870
871             rinv00           = avx256_invsqrt_f(rsq00);
872             rinv01           = avx256_invsqrt_f(rsq01);
873             rinv02           = avx256_invsqrt_f(rsq02);
874             rinv10           = avx256_invsqrt_f(rsq10);
875             rinv11           = avx256_invsqrt_f(rsq11);
876             rinv12           = avx256_invsqrt_f(rsq12);
877             rinv20           = avx256_invsqrt_f(rsq20);
878             rinv21           = avx256_invsqrt_f(rsq21);
879             rinv22           = avx256_invsqrt_f(rsq22);
880
881             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
882             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
883             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
884             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
885             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
886             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
887             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
888             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
889             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
890
891             fjx0             = _mm256_setzero_ps();
892             fjy0             = _mm256_setzero_ps();
893             fjz0             = _mm256_setzero_ps();
894             fjx1             = _mm256_setzero_ps();
895             fjy1             = _mm256_setzero_ps();
896             fjz1             = _mm256_setzero_ps();
897             fjx2             = _mm256_setzero_ps();
898             fjy2             = _mm256_setzero_ps();
899             fjz2             = _mm256_setzero_ps();
900
901             /**************************
902              * CALCULATE INTERACTIONS *
903              **************************/
904
905             if (gmx_mm256_any_lt(rsq00,rcutoff2))
906             {
907
908             r00              = _mm256_mul_ps(rsq00,rinv00);
909             r00              = _mm256_andnot_ps(dummy_mask,r00);
910
911             /* EWALD ELECTROSTATICS */
912             
913             /* Analytical PME correction */
914             zeta2            = _mm256_mul_ps(beta2,rsq00);
915             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
916             pmecorrF         = avx256_pmecorrF_f(zeta2);
917             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
918             felec            = _mm256_mul_ps(qq00,felec);
919             pmecorrV         = avx256_pmecorrV_f(zeta2);
920             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
921             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv00,sh_ewald),pmecorrV);
922             velec            = _mm256_mul_ps(qq00,velec);
923             
924             /* LENNARD-JONES DISPERSION/REPULSION */
925
926             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
927             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
928             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
929             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
930                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
931             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
932
933             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
934
935             /* Update potential sum for this i atom from the interaction with this j atom. */
936             velec            = _mm256_and_ps(velec,cutoff_mask);
937             velec            = _mm256_andnot_ps(dummy_mask,velec);
938             velecsum         = _mm256_add_ps(velecsum,velec);
939             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
940             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
941             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
942
943             fscal            = _mm256_add_ps(felec,fvdw);
944
945             fscal            = _mm256_and_ps(fscal,cutoff_mask);
946
947             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
948
949             /* Calculate temporary vectorial force */
950             tx               = _mm256_mul_ps(fscal,dx00);
951             ty               = _mm256_mul_ps(fscal,dy00);
952             tz               = _mm256_mul_ps(fscal,dz00);
953
954             /* Update vectorial force */
955             fix0             = _mm256_add_ps(fix0,tx);
956             fiy0             = _mm256_add_ps(fiy0,ty);
957             fiz0             = _mm256_add_ps(fiz0,tz);
958
959             fjx0             = _mm256_add_ps(fjx0,tx);
960             fjy0             = _mm256_add_ps(fjy0,ty);
961             fjz0             = _mm256_add_ps(fjz0,tz);
962
963             }
964
965             /**************************
966              * CALCULATE INTERACTIONS *
967              **************************/
968
969             if (gmx_mm256_any_lt(rsq01,rcutoff2))
970             {
971
972             r01              = _mm256_mul_ps(rsq01,rinv01);
973             r01              = _mm256_andnot_ps(dummy_mask,r01);
974
975             /* EWALD ELECTROSTATICS */
976             
977             /* Analytical PME correction */
978             zeta2            = _mm256_mul_ps(beta2,rsq01);
979             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
980             pmecorrF         = avx256_pmecorrF_f(zeta2);
981             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
982             felec            = _mm256_mul_ps(qq01,felec);
983             pmecorrV         = avx256_pmecorrV_f(zeta2);
984             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
985             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv01,sh_ewald),pmecorrV);
986             velec            = _mm256_mul_ps(qq01,velec);
987             
988             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
989
990             /* Update potential sum for this i atom from the interaction with this j atom. */
991             velec            = _mm256_and_ps(velec,cutoff_mask);
992             velec            = _mm256_andnot_ps(dummy_mask,velec);
993             velecsum         = _mm256_add_ps(velecsum,velec);
994
995             fscal            = felec;
996
997             fscal            = _mm256_and_ps(fscal,cutoff_mask);
998
999             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1000
1001             /* Calculate temporary vectorial force */
1002             tx               = _mm256_mul_ps(fscal,dx01);
1003             ty               = _mm256_mul_ps(fscal,dy01);
1004             tz               = _mm256_mul_ps(fscal,dz01);
1005
1006             /* Update vectorial force */
1007             fix0             = _mm256_add_ps(fix0,tx);
1008             fiy0             = _mm256_add_ps(fiy0,ty);
1009             fiz0             = _mm256_add_ps(fiz0,tz);
1010
1011             fjx1             = _mm256_add_ps(fjx1,tx);
1012             fjy1             = _mm256_add_ps(fjy1,ty);
1013             fjz1             = _mm256_add_ps(fjz1,tz);
1014
1015             }
1016
1017             /**************************
1018              * CALCULATE INTERACTIONS *
1019              **************************/
1020
1021             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1022             {
1023
1024             r02              = _mm256_mul_ps(rsq02,rinv02);
1025             r02              = _mm256_andnot_ps(dummy_mask,r02);
1026
1027             /* EWALD ELECTROSTATICS */
1028             
1029             /* Analytical PME correction */
1030             zeta2            = _mm256_mul_ps(beta2,rsq02);
1031             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1032             pmecorrF         = avx256_pmecorrF_f(zeta2);
1033             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1034             felec            = _mm256_mul_ps(qq02,felec);
1035             pmecorrV         = avx256_pmecorrV_f(zeta2);
1036             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1037             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv02,sh_ewald),pmecorrV);
1038             velec            = _mm256_mul_ps(qq02,velec);
1039             
1040             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1041
1042             /* Update potential sum for this i atom from the interaction with this j atom. */
1043             velec            = _mm256_and_ps(velec,cutoff_mask);
1044             velec            = _mm256_andnot_ps(dummy_mask,velec);
1045             velecsum         = _mm256_add_ps(velecsum,velec);
1046
1047             fscal            = felec;
1048
1049             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1050
1051             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1052
1053             /* Calculate temporary vectorial force */
1054             tx               = _mm256_mul_ps(fscal,dx02);
1055             ty               = _mm256_mul_ps(fscal,dy02);
1056             tz               = _mm256_mul_ps(fscal,dz02);
1057
1058             /* Update vectorial force */
1059             fix0             = _mm256_add_ps(fix0,tx);
1060             fiy0             = _mm256_add_ps(fiy0,ty);
1061             fiz0             = _mm256_add_ps(fiz0,tz);
1062
1063             fjx2             = _mm256_add_ps(fjx2,tx);
1064             fjy2             = _mm256_add_ps(fjy2,ty);
1065             fjz2             = _mm256_add_ps(fjz2,tz);
1066
1067             }
1068
1069             /**************************
1070              * CALCULATE INTERACTIONS *
1071              **************************/
1072
1073             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1074             {
1075
1076             r10              = _mm256_mul_ps(rsq10,rinv10);
1077             r10              = _mm256_andnot_ps(dummy_mask,r10);
1078
1079             /* EWALD ELECTROSTATICS */
1080             
1081             /* Analytical PME correction */
1082             zeta2            = _mm256_mul_ps(beta2,rsq10);
1083             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1084             pmecorrF         = avx256_pmecorrF_f(zeta2);
1085             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1086             felec            = _mm256_mul_ps(qq10,felec);
1087             pmecorrV         = avx256_pmecorrV_f(zeta2);
1088             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1089             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv10,sh_ewald),pmecorrV);
1090             velec            = _mm256_mul_ps(qq10,velec);
1091             
1092             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1093
1094             /* Update potential sum for this i atom from the interaction with this j atom. */
1095             velec            = _mm256_and_ps(velec,cutoff_mask);
1096             velec            = _mm256_andnot_ps(dummy_mask,velec);
1097             velecsum         = _mm256_add_ps(velecsum,velec);
1098
1099             fscal            = felec;
1100
1101             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1102
1103             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1104
1105             /* Calculate temporary vectorial force */
1106             tx               = _mm256_mul_ps(fscal,dx10);
1107             ty               = _mm256_mul_ps(fscal,dy10);
1108             tz               = _mm256_mul_ps(fscal,dz10);
1109
1110             /* Update vectorial force */
1111             fix1             = _mm256_add_ps(fix1,tx);
1112             fiy1             = _mm256_add_ps(fiy1,ty);
1113             fiz1             = _mm256_add_ps(fiz1,tz);
1114
1115             fjx0             = _mm256_add_ps(fjx0,tx);
1116             fjy0             = _mm256_add_ps(fjy0,ty);
1117             fjz0             = _mm256_add_ps(fjz0,tz);
1118
1119             }
1120
1121             /**************************
1122              * CALCULATE INTERACTIONS *
1123              **************************/
1124
1125             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1126             {
1127
1128             r11              = _mm256_mul_ps(rsq11,rinv11);
1129             r11              = _mm256_andnot_ps(dummy_mask,r11);
1130
1131             /* EWALD ELECTROSTATICS */
1132             
1133             /* Analytical PME correction */
1134             zeta2            = _mm256_mul_ps(beta2,rsq11);
1135             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1136             pmecorrF         = avx256_pmecorrF_f(zeta2);
1137             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1138             felec            = _mm256_mul_ps(qq11,felec);
1139             pmecorrV         = avx256_pmecorrV_f(zeta2);
1140             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1141             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
1142             velec            = _mm256_mul_ps(qq11,velec);
1143             
1144             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1145
1146             /* Update potential sum for this i atom from the interaction with this j atom. */
1147             velec            = _mm256_and_ps(velec,cutoff_mask);
1148             velec            = _mm256_andnot_ps(dummy_mask,velec);
1149             velecsum         = _mm256_add_ps(velecsum,velec);
1150
1151             fscal            = felec;
1152
1153             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1154
1155             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1156
1157             /* Calculate temporary vectorial force */
1158             tx               = _mm256_mul_ps(fscal,dx11);
1159             ty               = _mm256_mul_ps(fscal,dy11);
1160             tz               = _mm256_mul_ps(fscal,dz11);
1161
1162             /* Update vectorial force */
1163             fix1             = _mm256_add_ps(fix1,tx);
1164             fiy1             = _mm256_add_ps(fiy1,ty);
1165             fiz1             = _mm256_add_ps(fiz1,tz);
1166
1167             fjx1             = _mm256_add_ps(fjx1,tx);
1168             fjy1             = _mm256_add_ps(fjy1,ty);
1169             fjz1             = _mm256_add_ps(fjz1,tz);
1170
1171             }
1172
1173             /**************************
1174              * CALCULATE INTERACTIONS *
1175              **************************/
1176
1177             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1178             {
1179
1180             r12              = _mm256_mul_ps(rsq12,rinv12);
1181             r12              = _mm256_andnot_ps(dummy_mask,r12);
1182
1183             /* EWALD ELECTROSTATICS */
1184             
1185             /* Analytical PME correction */
1186             zeta2            = _mm256_mul_ps(beta2,rsq12);
1187             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1188             pmecorrF         = avx256_pmecorrF_f(zeta2);
1189             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1190             felec            = _mm256_mul_ps(qq12,felec);
1191             pmecorrV         = avx256_pmecorrV_f(zeta2);
1192             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1193             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
1194             velec            = _mm256_mul_ps(qq12,velec);
1195             
1196             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1197
1198             /* Update potential sum for this i atom from the interaction with this j atom. */
1199             velec            = _mm256_and_ps(velec,cutoff_mask);
1200             velec            = _mm256_andnot_ps(dummy_mask,velec);
1201             velecsum         = _mm256_add_ps(velecsum,velec);
1202
1203             fscal            = felec;
1204
1205             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1206
1207             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1208
1209             /* Calculate temporary vectorial force */
1210             tx               = _mm256_mul_ps(fscal,dx12);
1211             ty               = _mm256_mul_ps(fscal,dy12);
1212             tz               = _mm256_mul_ps(fscal,dz12);
1213
1214             /* Update vectorial force */
1215             fix1             = _mm256_add_ps(fix1,tx);
1216             fiy1             = _mm256_add_ps(fiy1,ty);
1217             fiz1             = _mm256_add_ps(fiz1,tz);
1218
1219             fjx2             = _mm256_add_ps(fjx2,tx);
1220             fjy2             = _mm256_add_ps(fjy2,ty);
1221             fjz2             = _mm256_add_ps(fjz2,tz);
1222
1223             }
1224
1225             /**************************
1226              * CALCULATE INTERACTIONS *
1227              **************************/
1228
1229             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1230             {
1231
1232             r20              = _mm256_mul_ps(rsq20,rinv20);
1233             r20              = _mm256_andnot_ps(dummy_mask,r20);
1234
1235             /* EWALD ELECTROSTATICS */
1236             
1237             /* Analytical PME correction */
1238             zeta2            = _mm256_mul_ps(beta2,rsq20);
1239             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1240             pmecorrF         = avx256_pmecorrF_f(zeta2);
1241             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1242             felec            = _mm256_mul_ps(qq20,felec);
1243             pmecorrV         = avx256_pmecorrV_f(zeta2);
1244             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1245             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv20,sh_ewald),pmecorrV);
1246             velec            = _mm256_mul_ps(qq20,velec);
1247             
1248             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1249
1250             /* Update potential sum for this i atom from the interaction with this j atom. */
1251             velec            = _mm256_and_ps(velec,cutoff_mask);
1252             velec            = _mm256_andnot_ps(dummy_mask,velec);
1253             velecsum         = _mm256_add_ps(velecsum,velec);
1254
1255             fscal            = felec;
1256
1257             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1258
1259             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1260
1261             /* Calculate temporary vectorial force */
1262             tx               = _mm256_mul_ps(fscal,dx20);
1263             ty               = _mm256_mul_ps(fscal,dy20);
1264             tz               = _mm256_mul_ps(fscal,dz20);
1265
1266             /* Update vectorial force */
1267             fix2             = _mm256_add_ps(fix2,tx);
1268             fiy2             = _mm256_add_ps(fiy2,ty);
1269             fiz2             = _mm256_add_ps(fiz2,tz);
1270
1271             fjx0             = _mm256_add_ps(fjx0,tx);
1272             fjy0             = _mm256_add_ps(fjy0,ty);
1273             fjz0             = _mm256_add_ps(fjz0,tz);
1274
1275             }
1276
1277             /**************************
1278              * CALCULATE INTERACTIONS *
1279              **************************/
1280
1281             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1282             {
1283
1284             r21              = _mm256_mul_ps(rsq21,rinv21);
1285             r21              = _mm256_andnot_ps(dummy_mask,r21);
1286
1287             /* EWALD ELECTROSTATICS */
1288             
1289             /* Analytical PME correction */
1290             zeta2            = _mm256_mul_ps(beta2,rsq21);
1291             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1292             pmecorrF         = avx256_pmecorrF_f(zeta2);
1293             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1294             felec            = _mm256_mul_ps(qq21,felec);
1295             pmecorrV         = avx256_pmecorrV_f(zeta2);
1296             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1297             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
1298             velec            = _mm256_mul_ps(qq21,velec);
1299             
1300             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1301
1302             /* Update potential sum for this i atom from the interaction with this j atom. */
1303             velec            = _mm256_and_ps(velec,cutoff_mask);
1304             velec            = _mm256_andnot_ps(dummy_mask,velec);
1305             velecsum         = _mm256_add_ps(velecsum,velec);
1306
1307             fscal            = felec;
1308
1309             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1310
1311             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1312
1313             /* Calculate temporary vectorial force */
1314             tx               = _mm256_mul_ps(fscal,dx21);
1315             ty               = _mm256_mul_ps(fscal,dy21);
1316             tz               = _mm256_mul_ps(fscal,dz21);
1317
1318             /* Update vectorial force */
1319             fix2             = _mm256_add_ps(fix2,tx);
1320             fiy2             = _mm256_add_ps(fiy2,ty);
1321             fiz2             = _mm256_add_ps(fiz2,tz);
1322
1323             fjx1             = _mm256_add_ps(fjx1,tx);
1324             fjy1             = _mm256_add_ps(fjy1,ty);
1325             fjz1             = _mm256_add_ps(fjz1,tz);
1326
1327             }
1328
1329             /**************************
1330              * CALCULATE INTERACTIONS *
1331              **************************/
1332
1333             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1334             {
1335
1336             r22              = _mm256_mul_ps(rsq22,rinv22);
1337             r22              = _mm256_andnot_ps(dummy_mask,r22);
1338
1339             /* EWALD ELECTROSTATICS */
1340             
1341             /* Analytical PME correction */
1342             zeta2            = _mm256_mul_ps(beta2,rsq22);
1343             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1344             pmecorrF         = avx256_pmecorrF_f(zeta2);
1345             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1346             felec            = _mm256_mul_ps(qq22,felec);
1347             pmecorrV         = avx256_pmecorrV_f(zeta2);
1348             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1349             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
1350             velec            = _mm256_mul_ps(qq22,velec);
1351             
1352             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1353
1354             /* Update potential sum for this i atom from the interaction with this j atom. */
1355             velec            = _mm256_and_ps(velec,cutoff_mask);
1356             velec            = _mm256_andnot_ps(dummy_mask,velec);
1357             velecsum         = _mm256_add_ps(velecsum,velec);
1358
1359             fscal            = felec;
1360
1361             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1362
1363             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1364
1365             /* Calculate temporary vectorial force */
1366             tx               = _mm256_mul_ps(fscal,dx22);
1367             ty               = _mm256_mul_ps(fscal,dy22);
1368             tz               = _mm256_mul_ps(fscal,dz22);
1369
1370             /* Update vectorial force */
1371             fix2             = _mm256_add_ps(fix2,tx);
1372             fiy2             = _mm256_add_ps(fiy2,ty);
1373             fiz2             = _mm256_add_ps(fiz2,tz);
1374
1375             fjx2             = _mm256_add_ps(fjx2,tx);
1376             fjy2             = _mm256_add_ps(fjy2,ty);
1377             fjz2             = _mm256_add_ps(fjz2,tz);
1378
1379             }
1380
1381             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1382             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1383             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1384             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1385             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1386             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1387             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1388             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1389
1390             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1391                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1392
1393             /* Inner loop uses 1008 flops */
1394         }
1395
1396         /* End of innermost loop */
1397
1398         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1399                                                  f+i_coord_offset,fshift+i_shift_offset);
1400
1401         ggid                        = gid[iidx];
1402         /* Update potential energies */
1403         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1404         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1405
1406         /* Increment number of inner iterations */
1407         inneriter                  += j_index_end - j_index_start;
1408
1409         /* Outer loop uses 20 flops */
1410     }
1411
1412     /* Increment number of outer iterations */
1413     outeriter        += nri;
1414
1415     /* Update outer/inner flops */
1416
1417     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*1008);
1418 }
1419 /*
1420  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_single
1421  * Electrostatics interaction: Ewald
1422  * VdW interaction:            LennardJones
1423  * Geometry:                   Water3-Water3
1424  * Calculate force/pot:        Force
1425  */
1426 void
1427 nb_kernel_ElecEwSh_VdwLJSh_GeomW3W3_F_avx_256_single
1428                     (t_nblist                    * gmx_restrict       nlist,
1429                      rvec                        * gmx_restrict          xx,
1430                      rvec                        * gmx_restrict          ff,
1431                      struct t_forcerec           * gmx_restrict          fr,
1432                      t_mdatoms                   * gmx_restrict     mdatoms,
1433                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1434                      t_nrnb                      * gmx_restrict        nrnb)
1435 {
1436     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1437      * just 0 for non-waters.
1438      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1439      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1440      */
1441     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1442     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1443     int              jnrA,jnrB,jnrC,jnrD;
1444     int              jnrE,jnrF,jnrG,jnrH;
1445     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1446     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1447     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1448     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1449     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1450     real             rcutoff_scalar;
1451     real             *shiftvec,*fshift,*x,*f;
1452     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1453     real             scratch[4*DIM];
1454     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1455     real *           vdwioffsetptr0;
1456     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1457     real *           vdwioffsetptr1;
1458     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1459     real *           vdwioffsetptr2;
1460     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1461     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1462     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1463     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1464     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1465     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1466     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1467     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1468     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1469     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1470     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1471     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1472     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1473     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1474     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1475     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1476     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1477     real             *charge;
1478     int              nvdwtype;
1479     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1480     int              *vdwtype;
1481     real             *vdwparam;
1482     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1483     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1484     __m256i          ewitab;
1485     __m128i          ewitab_lo,ewitab_hi;
1486     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1487     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1488     real             *ewtab;
1489     __m256           dummy_mask,cutoff_mask;
1490     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1491     __m256           one     = _mm256_set1_ps(1.0);
1492     __m256           two     = _mm256_set1_ps(2.0);
1493     x                = xx[0];
1494     f                = ff[0];
1495
1496     nri              = nlist->nri;
1497     iinr             = nlist->iinr;
1498     jindex           = nlist->jindex;
1499     jjnr             = nlist->jjnr;
1500     shiftidx         = nlist->shift;
1501     gid              = nlist->gid;
1502     shiftvec         = fr->shift_vec[0];
1503     fshift           = fr->fshift[0];
1504     facel            = _mm256_set1_ps(fr->ic->epsfac);
1505     charge           = mdatoms->chargeA;
1506     nvdwtype         = fr->ntype;
1507     vdwparam         = fr->nbfp;
1508     vdwtype          = mdatoms->typeA;
1509
1510     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1511     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1512     beta2            = _mm256_mul_ps(beta,beta);
1513     beta3            = _mm256_mul_ps(beta,beta2);
1514
1515     ewtab            = fr->ic->tabq_coul_F;
1516     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1517     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1518
1519     /* Setup water-specific parameters */
1520     inr              = nlist->iinr[0];
1521     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1522     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1523     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1524     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1525
1526     jq0              = _mm256_set1_ps(charge[inr+0]);
1527     jq1              = _mm256_set1_ps(charge[inr+1]);
1528     jq2              = _mm256_set1_ps(charge[inr+2]);
1529     vdwjidx0A        = 2*vdwtype[inr+0];
1530     qq00             = _mm256_mul_ps(iq0,jq0);
1531     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1532     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1533     qq01             = _mm256_mul_ps(iq0,jq1);
1534     qq02             = _mm256_mul_ps(iq0,jq2);
1535     qq10             = _mm256_mul_ps(iq1,jq0);
1536     qq11             = _mm256_mul_ps(iq1,jq1);
1537     qq12             = _mm256_mul_ps(iq1,jq2);
1538     qq20             = _mm256_mul_ps(iq2,jq0);
1539     qq21             = _mm256_mul_ps(iq2,jq1);
1540     qq22             = _mm256_mul_ps(iq2,jq2);
1541
1542     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1543     rcutoff_scalar   = fr->ic->rcoulomb;
1544     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1545     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1546
1547     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
1548     rvdw             = _mm256_set1_ps(fr->ic->rvdw);
1549
1550     /* Avoid stupid compiler warnings */
1551     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1552     j_coord_offsetA = 0;
1553     j_coord_offsetB = 0;
1554     j_coord_offsetC = 0;
1555     j_coord_offsetD = 0;
1556     j_coord_offsetE = 0;
1557     j_coord_offsetF = 0;
1558     j_coord_offsetG = 0;
1559     j_coord_offsetH = 0;
1560
1561     outeriter        = 0;
1562     inneriter        = 0;
1563
1564     for(iidx=0;iidx<4*DIM;iidx++)
1565     {
1566         scratch[iidx] = 0.0;
1567     }
1568
1569     /* Start outer loop over neighborlists */
1570     for(iidx=0; iidx<nri; iidx++)
1571     {
1572         /* Load shift vector for this list */
1573         i_shift_offset   = DIM*shiftidx[iidx];
1574
1575         /* Load limits for loop over neighbors */
1576         j_index_start    = jindex[iidx];
1577         j_index_end      = jindex[iidx+1];
1578
1579         /* Get outer coordinate index */
1580         inr              = iinr[iidx];
1581         i_coord_offset   = DIM*inr;
1582
1583         /* Load i particle coords and add shift vector */
1584         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1585                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1586
1587         fix0             = _mm256_setzero_ps();
1588         fiy0             = _mm256_setzero_ps();
1589         fiz0             = _mm256_setzero_ps();
1590         fix1             = _mm256_setzero_ps();
1591         fiy1             = _mm256_setzero_ps();
1592         fiz1             = _mm256_setzero_ps();
1593         fix2             = _mm256_setzero_ps();
1594         fiy2             = _mm256_setzero_ps();
1595         fiz2             = _mm256_setzero_ps();
1596
1597         /* Start inner kernel loop */
1598         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1599         {
1600
1601             /* Get j neighbor index, and coordinate index */
1602             jnrA             = jjnr[jidx];
1603             jnrB             = jjnr[jidx+1];
1604             jnrC             = jjnr[jidx+2];
1605             jnrD             = jjnr[jidx+3];
1606             jnrE             = jjnr[jidx+4];
1607             jnrF             = jjnr[jidx+5];
1608             jnrG             = jjnr[jidx+6];
1609             jnrH             = jjnr[jidx+7];
1610             j_coord_offsetA  = DIM*jnrA;
1611             j_coord_offsetB  = DIM*jnrB;
1612             j_coord_offsetC  = DIM*jnrC;
1613             j_coord_offsetD  = DIM*jnrD;
1614             j_coord_offsetE  = DIM*jnrE;
1615             j_coord_offsetF  = DIM*jnrF;
1616             j_coord_offsetG  = DIM*jnrG;
1617             j_coord_offsetH  = DIM*jnrH;
1618
1619             /* load j atom coordinates */
1620             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1621                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1622                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1623                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1624                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1625
1626             /* Calculate displacement vector */
1627             dx00             = _mm256_sub_ps(ix0,jx0);
1628             dy00             = _mm256_sub_ps(iy0,jy0);
1629             dz00             = _mm256_sub_ps(iz0,jz0);
1630             dx01             = _mm256_sub_ps(ix0,jx1);
1631             dy01             = _mm256_sub_ps(iy0,jy1);
1632             dz01             = _mm256_sub_ps(iz0,jz1);
1633             dx02             = _mm256_sub_ps(ix0,jx2);
1634             dy02             = _mm256_sub_ps(iy0,jy2);
1635             dz02             = _mm256_sub_ps(iz0,jz2);
1636             dx10             = _mm256_sub_ps(ix1,jx0);
1637             dy10             = _mm256_sub_ps(iy1,jy0);
1638             dz10             = _mm256_sub_ps(iz1,jz0);
1639             dx11             = _mm256_sub_ps(ix1,jx1);
1640             dy11             = _mm256_sub_ps(iy1,jy1);
1641             dz11             = _mm256_sub_ps(iz1,jz1);
1642             dx12             = _mm256_sub_ps(ix1,jx2);
1643             dy12             = _mm256_sub_ps(iy1,jy2);
1644             dz12             = _mm256_sub_ps(iz1,jz2);
1645             dx20             = _mm256_sub_ps(ix2,jx0);
1646             dy20             = _mm256_sub_ps(iy2,jy0);
1647             dz20             = _mm256_sub_ps(iz2,jz0);
1648             dx21             = _mm256_sub_ps(ix2,jx1);
1649             dy21             = _mm256_sub_ps(iy2,jy1);
1650             dz21             = _mm256_sub_ps(iz2,jz1);
1651             dx22             = _mm256_sub_ps(ix2,jx2);
1652             dy22             = _mm256_sub_ps(iy2,jy2);
1653             dz22             = _mm256_sub_ps(iz2,jz2);
1654
1655             /* Calculate squared distance and things based on it */
1656             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1657             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1658             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1659             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1660             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1661             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1662             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1663             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1664             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1665
1666             rinv00           = avx256_invsqrt_f(rsq00);
1667             rinv01           = avx256_invsqrt_f(rsq01);
1668             rinv02           = avx256_invsqrt_f(rsq02);
1669             rinv10           = avx256_invsqrt_f(rsq10);
1670             rinv11           = avx256_invsqrt_f(rsq11);
1671             rinv12           = avx256_invsqrt_f(rsq12);
1672             rinv20           = avx256_invsqrt_f(rsq20);
1673             rinv21           = avx256_invsqrt_f(rsq21);
1674             rinv22           = avx256_invsqrt_f(rsq22);
1675
1676             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1677             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1678             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1679             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1680             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1681             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1682             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1683             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1684             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1685
1686             fjx0             = _mm256_setzero_ps();
1687             fjy0             = _mm256_setzero_ps();
1688             fjz0             = _mm256_setzero_ps();
1689             fjx1             = _mm256_setzero_ps();
1690             fjy1             = _mm256_setzero_ps();
1691             fjz1             = _mm256_setzero_ps();
1692             fjx2             = _mm256_setzero_ps();
1693             fjy2             = _mm256_setzero_ps();
1694             fjz2             = _mm256_setzero_ps();
1695
1696             /**************************
1697              * CALCULATE INTERACTIONS *
1698              **************************/
1699
1700             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1701             {
1702
1703             r00              = _mm256_mul_ps(rsq00,rinv00);
1704
1705             /* EWALD ELECTROSTATICS */
1706             
1707             /* Analytical PME correction */
1708             zeta2            = _mm256_mul_ps(beta2,rsq00);
1709             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1710             pmecorrF         = avx256_pmecorrF_f(zeta2);
1711             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1712             felec            = _mm256_mul_ps(qq00,felec);
1713             
1714             /* LENNARD-JONES DISPERSION/REPULSION */
1715
1716             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1717             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1718
1719             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1720
1721             fscal            = _mm256_add_ps(felec,fvdw);
1722
1723             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1724
1725             /* Calculate temporary vectorial force */
1726             tx               = _mm256_mul_ps(fscal,dx00);
1727             ty               = _mm256_mul_ps(fscal,dy00);
1728             tz               = _mm256_mul_ps(fscal,dz00);
1729
1730             /* Update vectorial force */
1731             fix0             = _mm256_add_ps(fix0,tx);
1732             fiy0             = _mm256_add_ps(fiy0,ty);
1733             fiz0             = _mm256_add_ps(fiz0,tz);
1734
1735             fjx0             = _mm256_add_ps(fjx0,tx);
1736             fjy0             = _mm256_add_ps(fjy0,ty);
1737             fjz0             = _mm256_add_ps(fjz0,tz);
1738
1739             }
1740
1741             /**************************
1742              * CALCULATE INTERACTIONS *
1743              **************************/
1744
1745             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1746             {
1747
1748             r01              = _mm256_mul_ps(rsq01,rinv01);
1749
1750             /* EWALD ELECTROSTATICS */
1751             
1752             /* Analytical PME correction */
1753             zeta2            = _mm256_mul_ps(beta2,rsq01);
1754             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1755             pmecorrF         = avx256_pmecorrF_f(zeta2);
1756             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1757             felec            = _mm256_mul_ps(qq01,felec);
1758             
1759             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1760
1761             fscal            = felec;
1762
1763             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1764
1765             /* Calculate temporary vectorial force */
1766             tx               = _mm256_mul_ps(fscal,dx01);
1767             ty               = _mm256_mul_ps(fscal,dy01);
1768             tz               = _mm256_mul_ps(fscal,dz01);
1769
1770             /* Update vectorial force */
1771             fix0             = _mm256_add_ps(fix0,tx);
1772             fiy0             = _mm256_add_ps(fiy0,ty);
1773             fiz0             = _mm256_add_ps(fiz0,tz);
1774
1775             fjx1             = _mm256_add_ps(fjx1,tx);
1776             fjy1             = _mm256_add_ps(fjy1,ty);
1777             fjz1             = _mm256_add_ps(fjz1,tz);
1778
1779             }
1780
1781             /**************************
1782              * CALCULATE INTERACTIONS *
1783              **************************/
1784
1785             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1786             {
1787
1788             r02              = _mm256_mul_ps(rsq02,rinv02);
1789
1790             /* EWALD ELECTROSTATICS */
1791             
1792             /* Analytical PME correction */
1793             zeta2            = _mm256_mul_ps(beta2,rsq02);
1794             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1795             pmecorrF         = avx256_pmecorrF_f(zeta2);
1796             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1797             felec            = _mm256_mul_ps(qq02,felec);
1798             
1799             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1800
1801             fscal            = felec;
1802
1803             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1804
1805             /* Calculate temporary vectorial force */
1806             tx               = _mm256_mul_ps(fscal,dx02);
1807             ty               = _mm256_mul_ps(fscal,dy02);
1808             tz               = _mm256_mul_ps(fscal,dz02);
1809
1810             /* Update vectorial force */
1811             fix0             = _mm256_add_ps(fix0,tx);
1812             fiy0             = _mm256_add_ps(fiy0,ty);
1813             fiz0             = _mm256_add_ps(fiz0,tz);
1814
1815             fjx2             = _mm256_add_ps(fjx2,tx);
1816             fjy2             = _mm256_add_ps(fjy2,ty);
1817             fjz2             = _mm256_add_ps(fjz2,tz);
1818
1819             }
1820
1821             /**************************
1822              * CALCULATE INTERACTIONS *
1823              **************************/
1824
1825             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1826             {
1827
1828             r10              = _mm256_mul_ps(rsq10,rinv10);
1829
1830             /* EWALD ELECTROSTATICS */
1831             
1832             /* Analytical PME correction */
1833             zeta2            = _mm256_mul_ps(beta2,rsq10);
1834             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1835             pmecorrF         = avx256_pmecorrF_f(zeta2);
1836             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1837             felec            = _mm256_mul_ps(qq10,felec);
1838             
1839             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1840
1841             fscal            = felec;
1842
1843             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1844
1845             /* Calculate temporary vectorial force */
1846             tx               = _mm256_mul_ps(fscal,dx10);
1847             ty               = _mm256_mul_ps(fscal,dy10);
1848             tz               = _mm256_mul_ps(fscal,dz10);
1849
1850             /* Update vectorial force */
1851             fix1             = _mm256_add_ps(fix1,tx);
1852             fiy1             = _mm256_add_ps(fiy1,ty);
1853             fiz1             = _mm256_add_ps(fiz1,tz);
1854
1855             fjx0             = _mm256_add_ps(fjx0,tx);
1856             fjy0             = _mm256_add_ps(fjy0,ty);
1857             fjz0             = _mm256_add_ps(fjz0,tz);
1858
1859             }
1860
1861             /**************************
1862              * CALCULATE INTERACTIONS *
1863              **************************/
1864
1865             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1866             {
1867
1868             r11              = _mm256_mul_ps(rsq11,rinv11);
1869
1870             /* EWALD ELECTROSTATICS */
1871             
1872             /* Analytical PME correction */
1873             zeta2            = _mm256_mul_ps(beta2,rsq11);
1874             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1875             pmecorrF         = avx256_pmecorrF_f(zeta2);
1876             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1877             felec            = _mm256_mul_ps(qq11,felec);
1878             
1879             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1880
1881             fscal            = felec;
1882
1883             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1884
1885             /* Calculate temporary vectorial force */
1886             tx               = _mm256_mul_ps(fscal,dx11);
1887             ty               = _mm256_mul_ps(fscal,dy11);
1888             tz               = _mm256_mul_ps(fscal,dz11);
1889
1890             /* Update vectorial force */
1891             fix1             = _mm256_add_ps(fix1,tx);
1892             fiy1             = _mm256_add_ps(fiy1,ty);
1893             fiz1             = _mm256_add_ps(fiz1,tz);
1894
1895             fjx1             = _mm256_add_ps(fjx1,tx);
1896             fjy1             = _mm256_add_ps(fjy1,ty);
1897             fjz1             = _mm256_add_ps(fjz1,tz);
1898
1899             }
1900
1901             /**************************
1902              * CALCULATE INTERACTIONS *
1903              **************************/
1904
1905             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1906             {
1907
1908             r12              = _mm256_mul_ps(rsq12,rinv12);
1909
1910             /* EWALD ELECTROSTATICS */
1911             
1912             /* Analytical PME correction */
1913             zeta2            = _mm256_mul_ps(beta2,rsq12);
1914             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1915             pmecorrF         = avx256_pmecorrF_f(zeta2);
1916             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1917             felec            = _mm256_mul_ps(qq12,felec);
1918             
1919             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1920
1921             fscal            = felec;
1922
1923             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1924
1925             /* Calculate temporary vectorial force */
1926             tx               = _mm256_mul_ps(fscal,dx12);
1927             ty               = _mm256_mul_ps(fscal,dy12);
1928             tz               = _mm256_mul_ps(fscal,dz12);
1929
1930             /* Update vectorial force */
1931             fix1             = _mm256_add_ps(fix1,tx);
1932             fiy1             = _mm256_add_ps(fiy1,ty);
1933             fiz1             = _mm256_add_ps(fiz1,tz);
1934
1935             fjx2             = _mm256_add_ps(fjx2,tx);
1936             fjy2             = _mm256_add_ps(fjy2,ty);
1937             fjz2             = _mm256_add_ps(fjz2,tz);
1938
1939             }
1940
1941             /**************************
1942              * CALCULATE INTERACTIONS *
1943              **************************/
1944
1945             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1946             {
1947
1948             r20              = _mm256_mul_ps(rsq20,rinv20);
1949
1950             /* EWALD ELECTROSTATICS */
1951             
1952             /* Analytical PME correction */
1953             zeta2            = _mm256_mul_ps(beta2,rsq20);
1954             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1955             pmecorrF         = avx256_pmecorrF_f(zeta2);
1956             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1957             felec            = _mm256_mul_ps(qq20,felec);
1958             
1959             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1960
1961             fscal            = felec;
1962
1963             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1964
1965             /* Calculate temporary vectorial force */
1966             tx               = _mm256_mul_ps(fscal,dx20);
1967             ty               = _mm256_mul_ps(fscal,dy20);
1968             tz               = _mm256_mul_ps(fscal,dz20);
1969
1970             /* Update vectorial force */
1971             fix2             = _mm256_add_ps(fix2,tx);
1972             fiy2             = _mm256_add_ps(fiy2,ty);
1973             fiz2             = _mm256_add_ps(fiz2,tz);
1974
1975             fjx0             = _mm256_add_ps(fjx0,tx);
1976             fjy0             = _mm256_add_ps(fjy0,ty);
1977             fjz0             = _mm256_add_ps(fjz0,tz);
1978
1979             }
1980
1981             /**************************
1982              * CALCULATE INTERACTIONS *
1983              **************************/
1984
1985             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1986             {
1987
1988             r21              = _mm256_mul_ps(rsq21,rinv21);
1989
1990             /* EWALD ELECTROSTATICS */
1991             
1992             /* Analytical PME correction */
1993             zeta2            = _mm256_mul_ps(beta2,rsq21);
1994             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1995             pmecorrF         = avx256_pmecorrF_f(zeta2);
1996             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1997             felec            = _mm256_mul_ps(qq21,felec);
1998             
1999             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2000
2001             fscal            = felec;
2002
2003             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2004
2005             /* Calculate temporary vectorial force */
2006             tx               = _mm256_mul_ps(fscal,dx21);
2007             ty               = _mm256_mul_ps(fscal,dy21);
2008             tz               = _mm256_mul_ps(fscal,dz21);
2009
2010             /* Update vectorial force */
2011             fix2             = _mm256_add_ps(fix2,tx);
2012             fiy2             = _mm256_add_ps(fiy2,ty);
2013             fiz2             = _mm256_add_ps(fiz2,tz);
2014
2015             fjx1             = _mm256_add_ps(fjx1,tx);
2016             fjy1             = _mm256_add_ps(fjy1,ty);
2017             fjz1             = _mm256_add_ps(fjz1,tz);
2018
2019             }
2020
2021             /**************************
2022              * CALCULATE INTERACTIONS *
2023              **************************/
2024
2025             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2026             {
2027
2028             r22              = _mm256_mul_ps(rsq22,rinv22);
2029
2030             /* EWALD ELECTROSTATICS */
2031             
2032             /* Analytical PME correction */
2033             zeta2            = _mm256_mul_ps(beta2,rsq22);
2034             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2035             pmecorrF         = avx256_pmecorrF_f(zeta2);
2036             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2037             felec            = _mm256_mul_ps(qq22,felec);
2038             
2039             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2040
2041             fscal            = felec;
2042
2043             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2044
2045             /* Calculate temporary vectorial force */
2046             tx               = _mm256_mul_ps(fscal,dx22);
2047             ty               = _mm256_mul_ps(fscal,dy22);
2048             tz               = _mm256_mul_ps(fscal,dz22);
2049
2050             /* Update vectorial force */
2051             fix2             = _mm256_add_ps(fix2,tx);
2052             fiy2             = _mm256_add_ps(fiy2,ty);
2053             fiz2             = _mm256_add_ps(fiz2,tz);
2054
2055             fjx2             = _mm256_add_ps(fjx2,tx);
2056             fjy2             = _mm256_add_ps(fjy2,ty);
2057             fjz2             = _mm256_add_ps(fjz2,tz);
2058
2059             }
2060
2061             fjptrA             = f+j_coord_offsetA;
2062             fjptrB             = f+j_coord_offsetB;
2063             fjptrC             = f+j_coord_offsetC;
2064             fjptrD             = f+j_coord_offsetD;
2065             fjptrE             = f+j_coord_offsetE;
2066             fjptrF             = f+j_coord_offsetF;
2067             fjptrG             = f+j_coord_offsetG;
2068             fjptrH             = f+j_coord_offsetH;
2069
2070             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2071                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2072
2073             /* Inner loop uses 538 flops */
2074         }
2075
2076         if(jidx<j_index_end)
2077         {
2078
2079             /* Get j neighbor index, and coordinate index */
2080             jnrlistA         = jjnr[jidx];
2081             jnrlistB         = jjnr[jidx+1];
2082             jnrlistC         = jjnr[jidx+2];
2083             jnrlistD         = jjnr[jidx+3];
2084             jnrlistE         = jjnr[jidx+4];
2085             jnrlistF         = jjnr[jidx+5];
2086             jnrlistG         = jjnr[jidx+6];
2087             jnrlistH         = jjnr[jidx+7];
2088             /* Sign of each element will be negative for non-real atoms.
2089              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2090              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2091              */
2092             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2093                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2094                                             
2095             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2096             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2097             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2098             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2099             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2100             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2101             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2102             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2103             j_coord_offsetA  = DIM*jnrA;
2104             j_coord_offsetB  = DIM*jnrB;
2105             j_coord_offsetC  = DIM*jnrC;
2106             j_coord_offsetD  = DIM*jnrD;
2107             j_coord_offsetE  = DIM*jnrE;
2108             j_coord_offsetF  = DIM*jnrF;
2109             j_coord_offsetG  = DIM*jnrG;
2110             j_coord_offsetH  = DIM*jnrH;
2111
2112             /* load j atom coordinates */
2113             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2114                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2115                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2116                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2117                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2118
2119             /* Calculate displacement vector */
2120             dx00             = _mm256_sub_ps(ix0,jx0);
2121             dy00             = _mm256_sub_ps(iy0,jy0);
2122             dz00             = _mm256_sub_ps(iz0,jz0);
2123             dx01             = _mm256_sub_ps(ix0,jx1);
2124             dy01             = _mm256_sub_ps(iy0,jy1);
2125             dz01             = _mm256_sub_ps(iz0,jz1);
2126             dx02             = _mm256_sub_ps(ix0,jx2);
2127             dy02             = _mm256_sub_ps(iy0,jy2);
2128             dz02             = _mm256_sub_ps(iz0,jz2);
2129             dx10             = _mm256_sub_ps(ix1,jx0);
2130             dy10             = _mm256_sub_ps(iy1,jy0);
2131             dz10             = _mm256_sub_ps(iz1,jz0);
2132             dx11             = _mm256_sub_ps(ix1,jx1);
2133             dy11             = _mm256_sub_ps(iy1,jy1);
2134             dz11             = _mm256_sub_ps(iz1,jz1);
2135             dx12             = _mm256_sub_ps(ix1,jx2);
2136             dy12             = _mm256_sub_ps(iy1,jy2);
2137             dz12             = _mm256_sub_ps(iz1,jz2);
2138             dx20             = _mm256_sub_ps(ix2,jx0);
2139             dy20             = _mm256_sub_ps(iy2,jy0);
2140             dz20             = _mm256_sub_ps(iz2,jz0);
2141             dx21             = _mm256_sub_ps(ix2,jx1);
2142             dy21             = _mm256_sub_ps(iy2,jy1);
2143             dz21             = _mm256_sub_ps(iz2,jz1);
2144             dx22             = _mm256_sub_ps(ix2,jx2);
2145             dy22             = _mm256_sub_ps(iy2,jy2);
2146             dz22             = _mm256_sub_ps(iz2,jz2);
2147
2148             /* Calculate squared distance and things based on it */
2149             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2150             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2151             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2152             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2153             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2154             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2155             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2156             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2157             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2158
2159             rinv00           = avx256_invsqrt_f(rsq00);
2160             rinv01           = avx256_invsqrt_f(rsq01);
2161             rinv02           = avx256_invsqrt_f(rsq02);
2162             rinv10           = avx256_invsqrt_f(rsq10);
2163             rinv11           = avx256_invsqrt_f(rsq11);
2164             rinv12           = avx256_invsqrt_f(rsq12);
2165             rinv20           = avx256_invsqrt_f(rsq20);
2166             rinv21           = avx256_invsqrt_f(rsq21);
2167             rinv22           = avx256_invsqrt_f(rsq22);
2168
2169             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
2170             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
2171             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
2172             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
2173             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
2174             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
2175             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
2176             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
2177             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
2178
2179             fjx0             = _mm256_setzero_ps();
2180             fjy0             = _mm256_setzero_ps();
2181             fjz0             = _mm256_setzero_ps();
2182             fjx1             = _mm256_setzero_ps();
2183             fjy1             = _mm256_setzero_ps();
2184             fjz1             = _mm256_setzero_ps();
2185             fjx2             = _mm256_setzero_ps();
2186             fjy2             = _mm256_setzero_ps();
2187             fjz2             = _mm256_setzero_ps();
2188
2189             /**************************
2190              * CALCULATE INTERACTIONS *
2191              **************************/
2192
2193             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2194             {
2195
2196             r00              = _mm256_mul_ps(rsq00,rinv00);
2197             r00              = _mm256_andnot_ps(dummy_mask,r00);
2198
2199             /* EWALD ELECTROSTATICS */
2200             
2201             /* Analytical PME correction */
2202             zeta2            = _mm256_mul_ps(beta2,rsq00);
2203             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
2204             pmecorrF         = avx256_pmecorrF_f(zeta2);
2205             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2206             felec            = _mm256_mul_ps(qq00,felec);
2207             
2208             /* LENNARD-JONES DISPERSION/REPULSION */
2209
2210             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2211             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2212
2213             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2214
2215             fscal            = _mm256_add_ps(felec,fvdw);
2216
2217             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2218
2219             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2220
2221             /* Calculate temporary vectorial force */
2222             tx               = _mm256_mul_ps(fscal,dx00);
2223             ty               = _mm256_mul_ps(fscal,dy00);
2224             tz               = _mm256_mul_ps(fscal,dz00);
2225
2226             /* Update vectorial force */
2227             fix0             = _mm256_add_ps(fix0,tx);
2228             fiy0             = _mm256_add_ps(fiy0,ty);
2229             fiz0             = _mm256_add_ps(fiz0,tz);
2230
2231             fjx0             = _mm256_add_ps(fjx0,tx);
2232             fjy0             = _mm256_add_ps(fjy0,ty);
2233             fjz0             = _mm256_add_ps(fjz0,tz);
2234
2235             }
2236
2237             /**************************
2238              * CALCULATE INTERACTIONS *
2239              **************************/
2240
2241             if (gmx_mm256_any_lt(rsq01,rcutoff2))
2242             {
2243
2244             r01              = _mm256_mul_ps(rsq01,rinv01);
2245             r01              = _mm256_andnot_ps(dummy_mask,r01);
2246
2247             /* EWALD ELECTROSTATICS */
2248             
2249             /* Analytical PME correction */
2250             zeta2            = _mm256_mul_ps(beta2,rsq01);
2251             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
2252             pmecorrF         = avx256_pmecorrF_f(zeta2);
2253             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2254             felec            = _mm256_mul_ps(qq01,felec);
2255             
2256             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2257
2258             fscal            = felec;
2259
2260             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2261
2262             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2263
2264             /* Calculate temporary vectorial force */
2265             tx               = _mm256_mul_ps(fscal,dx01);
2266             ty               = _mm256_mul_ps(fscal,dy01);
2267             tz               = _mm256_mul_ps(fscal,dz01);
2268
2269             /* Update vectorial force */
2270             fix0             = _mm256_add_ps(fix0,tx);
2271             fiy0             = _mm256_add_ps(fiy0,ty);
2272             fiz0             = _mm256_add_ps(fiz0,tz);
2273
2274             fjx1             = _mm256_add_ps(fjx1,tx);
2275             fjy1             = _mm256_add_ps(fjy1,ty);
2276             fjz1             = _mm256_add_ps(fjz1,tz);
2277
2278             }
2279
2280             /**************************
2281              * CALCULATE INTERACTIONS *
2282              **************************/
2283
2284             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2285             {
2286
2287             r02              = _mm256_mul_ps(rsq02,rinv02);
2288             r02              = _mm256_andnot_ps(dummy_mask,r02);
2289
2290             /* EWALD ELECTROSTATICS */
2291             
2292             /* Analytical PME correction */
2293             zeta2            = _mm256_mul_ps(beta2,rsq02);
2294             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
2295             pmecorrF         = avx256_pmecorrF_f(zeta2);
2296             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2297             felec            = _mm256_mul_ps(qq02,felec);
2298             
2299             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2300
2301             fscal            = felec;
2302
2303             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2304
2305             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2306
2307             /* Calculate temporary vectorial force */
2308             tx               = _mm256_mul_ps(fscal,dx02);
2309             ty               = _mm256_mul_ps(fscal,dy02);
2310             tz               = _mm256_mul_ps(fscal,dz02);
2311
2312             /* Update vectorial force */
2313             fix0             = _mm256_add_ps(fix0,tx);
2314             fiy0             = _mm256_add_ps(fiy0,ty);
2315             fiz0             = _mm256_add_ps(fiz0,tz);
2316
2317             fjx2             = _mm256_add_ps(fjx2,tx);
2318             fjy2             = _mm256_add_ps(fjy2,ty);
2319             fjz2             = _mm256_add_ps(fjz2,tz);
2320
2321             }
2322
2323             /**************************
2324              * CALCULATE INTERACTIONS *
2325              **************************/
2326
2327             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2328             {
2329
2330             r10              = _mm256_mul_ps(rsq10,rinv10);
2331             r10              = _mm256_andnot_ps(dummy_mask,r10);
2332
2333             /* EWALD ELECTROSTATICS */
2334             
2335             /* Analytical PME correction */
2336             zeta2            = _mm256_mul_ps(beta2,rsq10);
2337             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
2338             pmecorrF         = avx256_pmecorrF_f(zeta2);
2339             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2340             felec            = _mm256_mul_ps(qq10,felec);
2341             
2342             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2343
2344             fscal            = felec;
2345
2346             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2347
2348             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2349
2350             /* Calculate temporary vectorial force */
2351             tx               = _mm256_mul_ps(fscal,dx10);
2352             ty               = _mm256_mul_ps(fscal,dy10);
2353             tz               = _mm256_mul_ps(fscal,dz10);
2354
2355             /* Update vectorial force */
2356             fix1             = _mm256_add_ps(fix1,tx);
2357             fiy1             = _mm256_add_ps(fiy1,ty);
2358             fiz1             = _mm256_add_ps(fiz1,tz);
2359
2360             fjx0             = _mm256_add_ps(fjx0,tx);
2361             fjy0             = _mm256_add_ps(fjy0,ty);
2362             fjz0             = _mm256_add_ps(fjz0,tz);
2363
2364             }
2365
2366             /**************************
2367              * CALCULATE INTERACTIONS *
2368              **************************/
2369
2370             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2371             {
2372
2373             r11              = _mm256_mul_ps(rsq11,rinv11);
2374             r11              = _mm256_andnot_ps(dummy_mask,r11);
2375
2376             /* EWALD ELECTROSTATICS */
2377             
2378             /* Analytical PME correction */
2379             zeta2            = _mm256_mul_ps(beta2,rsq11);
2380             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2381             pmecorrF         = avx256_pmecorrF_f(zeta2);
2382             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2383             felec            = _mm256_mul_ps(qq11,felec);
2384             
2385             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2386
2387             fscal            = felec;
2388
2389             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2390
2391             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2392
2393             /* Calculate temporary vectorial force */
2394             tx               = _mm256_mul_ps(fscal,dx11);
2395             ty               = _mm256_mul_ps(fscal,dy11);
2396             tz               = _mm256_mul_ps(fscal,dz11);
2397
2398             /* Update vectorial force */
2399             fix1             = _mm256_add_ps(fix1,tx);
2400             fiy1             = _mm256_add_ps(fiy1,ty);
2401             fiz1             = _mm256_add_ps(fiz1,tz);
2402
2403             fjx1             = _mm256_add_ps(fjx1,tx);
2404             fjy1             = _mm256_add_ps(fjy1,ty);
2405             fjz1             = _mm256_add_ps(fjz1,tz);
2406
2407             }
2408
2409             /**************************
2410              * CALCULATE INTERACTIONS *
2411              **************************/
2412
2413             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2414             {
2415
2416             r12              = _mm256_mul_ps(rsq12,rinv12);
2417             r12              = _mm256_andnot_ps(dummy_mask,r12);
2418
2419             /* EWALD ELECTROSTATICS */
2420             
2421             /* Analytical PME correction */
2422             zeta2            = _mm256_mul_ps(beta2,rsq12);
2423             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2424             pmecorrF         = avx256_pmecorrF_f(zeta2);
2425             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2426             felec            = _mm256_mul_ps(qq12,felec);
2427             
2428             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2429
2430             fscal            = felec;
2431
2432             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2433
2434             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2435
2436             /* Calculate temporary vectorial force */
2437             tx               = _mm256_mul_ps(fscal,dx12);
2438             ty               = _mm256_mul_ps(fscal,dy12);
2439             tz               = _mm256_mul_ps(fscal,dz12);
2440
2441             /* Update vectorial force */
2442             fix1             = _mm256_add_ps(fix1,tx);
2443             fiy1             = _mm256_add_ps(fiy1,ty);
2444             fiz1             = _mm256_add_ps(fiz1,tz);
2445
2446             fjx2             = _mm256_add_ps(fjx2,tx);
2447             fjy2             = _mm256_add_ps(fjy2,ty);
2448             fjz2             = _mm256_add_ps(fjz2,tz);
2449
2450             }
2451
2452             /**************************
2453              * CALCULATE INTERACTIONS *
2454              **************************/
2455
2456             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2457             {
2458
2459             r20              = _mm256_mul_ps(rsq20,rinv20);
2460             r20              = _mm256_andnot_ps(dummy_mask,r20);
2461
2462             /* EWALD ELECTROSTATICS */
2463             
2464             /* Analytical PME correction */
2465             zeta2            = _mm256_mul_ps(beta2,rsq20);
2466             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
2467             pmecorrF         = avx256_pmecorrF_f(zeta2);
2468             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2469             felec            = _mm256_mul_ps(qq20,felec);
2470             
2471             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2472
2473             fscal            = felec;
2474
2475             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2476
2477             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2478
2479             /* Calculate temporary vectorial force */
2480             tx               = _mm256_mul_ps(fscal,dx20);
2481             ty               = _mm256_mul_ps(fscal,dy20);
2482             tz               = _mm256_mul_ps(fscal,dz20);
2483
2484             /* Update vectorial force */
2485             fix2             = _mm256_add_ps(fix2,tx);
2486             fiy2             = _mm256_add_ps(fiy2,ty);
2487             fiz2             = _mm256_add_ps(fiz2,tz);
2488
2489             fjx0             = _mm256_add_ps(fjx0,tx);
2490             fjy0             = _mm256_add_ps(fjy0,ty);
2491             fjz0             = _mm256_add_ps(fjz0,tz);
2492
2493             }
2494
2495             /**************************
2496              * CALCULATE INTERACTIONS *
2497              **************************/
2498
2499             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2500             {
2501
2502             r21              = _mm256_mul_ps(rsq21,rinv21);
2503             r21              = _mm256_andnot_ps(dummy_mask,r21);
2504
2505             /* EWALD ELECTROSTATICS */
2506             
2507             /* Analytical PME correction */
2508             zeta2            = _mm256_mul_ps(beta2,rsq21);
2509             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2510             pmecorrF         = avx256_pmecorrF_f(zeta2);
2511             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2512             felec            = _mm256_mul_ps(qq21,felec);
2513             
2514             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2515
2516             fscal            = felec;
2517
2518             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2519
2520             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2521
2522             /* Calculate temporary vectorial force */
2523             tx               = _mm256_mul_ps(fscal,dx21);
2524             ty               = _mm256_mul_ps(fscal,dy21);
2525             tz               = _mm256_mul_ps(fscal,dz21);
2526
2527             /* Update vectorial force */
2528             fix2             = _mm256_add_ps(fix2,tx);
2529             fiy2             = _mm256_add_ps(fiy2,ty);
2530             fiz2             = _mm256_add_ps(fiz2,tz);
2531
2532             fjx1             = _mm256_add_ps(fjx1,tx);
2533             fjy1             = _mm256_add_ps(fjy1,ty);
2534             fjz1             = _mm256_add_ps(fjz1,tz);
2535
2536             }
2537
2538             /**************************
2539              * CALCULATE INTERACTIONS *
2540              **************************/
2541
2542             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2543             {
2544
2545             r22              = _mm256_mul_ps(rsq22,rinv22);
2546             r22              = _mm256_andnot_ps(dummy_mask,r22);
2547
2548             /* EWALD ELECTROSTATICS */
2549             
2550             /* Analytical PME correction */
2551             zeta2            = _mm256_mul_ps(beta2,rsq22);
2552             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2553             pmecorrF         = avx256_pmecorrF_f(zeta2);
2554             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2555             felec            = _mm256_mul_ps(qq22,felec);
2556             
2557             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2558
2559             fscal            = felec;
2560
2561             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2562
2563             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2564
2565             /* Calculate temporary vectorial force */
2566             tx               = _mm256_mul_ps(fscal,dx22);
2567             ty               = _mm256_mul_ps(fscal,dy22);
2568             tz               = _mm256_mul_ps(fscal,dz22);
2569
2570             /* Update vectorial force */
2571             fix2             = _mm256_add_ps(fix2,tx);
2572             fiy2             = _mm256_add_ps(fiy2,ty);
2573             fiz2             = _mm256_add_ps(fiz2,tz);
2574
2575             fjx2             = _mm256_add_ps(fjx2,tx);
2576             fjy2             = _mm256_add_ps(fjy2,ty);
2577             fjz2             = _mm256_add_ps(fjz2,tz);
2578
2579             }
2580
2581             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2582             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2583             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2584             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2585             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2586             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2587             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2588             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2589
2590             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2591                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2592
2593             /* Inner loop uses 547 flops */
2594         }
2595
2596         /* End of innermost loop */
2597
2598         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2599                                                  f+i_coord_offset,fshift+i_shift_offset);
2600
2601         /* Increment number of inner iterations */
2602         inneriter                  += j_index_end - j_index_start;
2603
2604         /* Outer loop uses 18 flops */
2605     }
2606
2607     /* Increment number of outer iterations */
2608     outeriter        += nri;
2609
2610     /* Update outer/inner flops */
2611
2612     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*547);
2613 }