48a7e8138c00502fb3b4775aba7030b0f7c55558
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gmx_math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71  * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72  * jnr indices corresponding to data put in the eight positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrE,jnrF,jnrG,jnrH;
78     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
83     real             rcutoff_scalar;
84     real             *shiftvec,*fshift,*x,*f;
85     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
86     real             scratch[4*DIM];
87     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88     real *           vdwioffsetptr0;
89     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90     real *           vdwioffsetptr1;
91     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92     real *           vdwioffsetptr2;
93     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
102     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
103     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
104     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
105     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
106     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
107     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
108     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
109     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
116     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
117     __m256i          ewitab;
118     __m128i          ewitab_lo,ewitab_hi;
119     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
120     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
121     real             *ewtab;
122     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
123     real             rswitch_scalar,d_scalar;
124     __m256           dummy_mask,cutoff_mask;
125     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
126     __m256           one     = _mm256_set1_ps(1.0);
127     __m256           two     = _mm256_set1_ps(2.0);
128     x                = xx[0];
129     f                = ff[0];
130
131     nri              = nlist->nri;
132     iinr             = nlist->iinr;
133     jindex           = nlist->jindex;
134     jjnr             = nlist->jjnr;
135     shiftidx         = nlist->shift;
136     gid              = nlist->gid;
137     shiftvec         = fr->shift_vec[0];
138     fshift           = fr->fshift[0];
139     facel            = _mm256_set1_ps(fr->epsfac);
140     charge           = mdatoms->chargeA;
141     nvdwtype         = fr->ntype;
142     vdwparam         = fr->nbfp;
143     vdwtype          = mdatoms->typeA;
144
145     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
146     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
147     beta2            = _mm256_mul_ps(beta,beta);
148     beta3            = _mm256_mul_ps(beta,beta2);
149
150     ewtab            = fr->ic->tabq_coul_FDV0;
151     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
152     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
153
154     /* Setup water-specific parameters */
155     inr              = nlist->iinr[0];
156     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
157     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
158     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
159     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
160
161     jq0              = _mm256_set1_ps(charge[inr+0]);
162     jq1              = _mm256_set1_ps(charge[inr+1]);
163     jq2              = _mm256_set1_ps(charge[inr+2]);
164     vdwjidx0A        = 2*vdwtype[inr+0];
165     qq00             = _mm256_mul_ps(iq0,jq0);
166     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
167     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
168     qq01             = _mm256_mul_ps(iq0,jq1);
169     qq02             = _mm256_mul_ps(iq0,jq2);
170     qq10             = _mm256_mul_ps(iq1,jq0);
171     qq11             = _mm256_mul_ps(iq1,jq1);
172     qq12             = _mm256_mul_ps(iq1,jq2);
173     qq20             = _mm256_mul_ps(iq2,jq0);
174     qq21             = _mm256_mul_ps(iq2,jq1);
175     qq22             = _mm256_mul_ps(iq2,jq2);
176
177     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
178     rcutoff_scalar   = fr->rcoulomb;
179     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
180     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
181
182     rswitch_scalar   = fr->rcoulomb_switch;
183     rswitch          = _mm256_set1_ps(rswitch_scalar);
184     /* Setup switch parameters */
185     d_scalar         = rcutoff_scalar-rswitch_scalar;
186     d                = _mm256_set1_ps(d_scalar);
187     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
188     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
189     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
190     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
191     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
192     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
193
194     /* Avoid stupid compiler warnings */
195     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
196     j_coord_offsetA = 0;
197     j_coord_offsetB = 0;
198     j_coord_offsetC = 0;
199     j_coord_offsetD = 0;
200     j_coord_offsetE = 0;
201     j_coord_offsetF = 0;
202     j_coord_offsetG = 0;
203     j_coord_offsetH = 0;
204
205     outeriter        = 0;
206     inneriter        = 0;
207
208     for(iidx=0;iidx<4*DIM;iidx++)
209     {
210         scratch[iidx] = 0.0;
211     }
212
213     /* Start outer loop over neighborlists */
214     for(iidx=0; iidx<nri; iidx++)
215     {
216         /* Load shift vector for this list */
217         i_shift_offset   = DIM*shiftidx[iidx];
218
219         /* Load limits for loop over neighbors */
220         j_index_start    = jindex[iidx];
221         j_index_end      = jindex[iidx+1];
222
223         /* Get outer coordinate index */
224         inr              = iinr[iidx];
225         i_coord_offset   = DIM*inr;
226
227         /* Load i particle coords and add shift vector */
228         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
229                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
230
231         fix0             = _mm256_setzero_ps();
232         fiy0             = _mm256_setzero_ps();
233         fiz0             = _mm256_setzero_ps();
234         fix1             = _mm256_setzero_ps();
235         fiy1             = _mm256_setzero_ps();
236         fiz1             = _mm256_setzero_ps();
237         fix2             = _mm256_setzero_ps();
238         fiy2             = _mm256_setzero_ps();
239         fiz2             = _mm256_setzero_ps();
240
241         /* Reset potential sums */
242         velecsum         = _mm256_setzero_ps();
243         vvdwsum          = _mm256_setzero_ps();
244
245         /* Start inner kernel loop */
246         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
247         {
248
249             /* Get j neighbor index, and coordinate index */
250             jnrA             = jjnr[jidx];
251             jnrB             = jjnr[jidx+1];
252             jnrC             = jjnr[jidx+2];
253             jnrD             = jjnr[jidx+3];
254             jnrE             = jjnr[jidx+4];
255             jnrF             = jjnr[jidx+5];
256             jnrG             = jjnr[jidx+6];
257             jnrH             = jjnr[jidx+7];
258             j_coord_offsetA  = DIM*jnrA;
259             j_coord_offsetB  = DIM*jnrB;
260             j_coord_offsetC  = DIM*jnrC;
261             j_coord_offsetD  = DIM*jnrD;
262             j_coord_offsetE  = DIM*jnrE;
263             j_coord_offsetF  = DIM*jnrF;
264             j_coord_offsetG  = DIM*jnrG;
265             j_coord_offsetH  = DIM*jnrH;
266
267             /* load j atom coordinates */
268             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
269                                                  x+j_coord_offsetC,x+j_coord_offsetD,
270                                                  x+j_coord_offsetE,x+j_coord_offsetF,
271                                                  x+j_coord_offsetG,x+j_coord_offsetH,
272                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
273
274             /* Calculate displacement vector */
275             dx00             = _mm256_sub_ps(ix0,jx0);
276             dy00             = _mm256_sub_ps(iy0,jy0);
277             dz00             = _mm256_sub_ps(iz0,jz0);
278             dx01             = _mm256_sub_ps(ix0,jx1);
279             dy01             = _mm256_sub_ps(iy0,jy1);
280             dz01             = _mm256_sub_ps(iz0,jz1);
281             dx02             = _mm256_sub_ps(ix0,jx2);
282             dy02             = _mm256_sub_ps(iy0,jy2);
283             dz02             = _mm256_sub_ps(iz0,jz2);
284             dx10             = _mm256_sub_ps(ix1,jx0);
285             dy10             = _mm256_sub_ps(iy1,jy0);
286             dz10             = _mm256_sub_ps(iz1,jz0);
287             dx11             = _mm256_sub_ps(ix1,jx1);
288             dy11             = _mm256_sub_ps(iy1,jy1);
289             dz11             = _mm256_sub_ps(iz1,jz1);
290             dx12             = _mm256_sub_ps(ix1,jx2);
291             dy12             = _mm256_sub_ps(iy1,jy2);
292             dz12             = _mm256_sub_ps(iz1,jz2);
293             dx20             = _mm256_sub_ps(ix2,jx0);
294             dy20             = _mm256_sub_ps(iy2,jy0);
295             dz20             = _mm256_sub_ps(iz2,jz0);
296             dx21             = _mm256_sub_ps(ix2,jx1);
297             dy21             = _mm256_sub_ps(iy2,jy1);
298             dz21             = _mm256_sub_ps(iz2,jz1);
299             dx22             = _mm256_sub_ps(ix2,jx2);
300             dy22             = _mm256_sub_ps(iy2,jy2);
301             dz22             = _mm256_sub_ps(iz2,jz2);
302
303             /* Calculate squared distance and things based on it */
304             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
305             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
306             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
307             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
308             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
309             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
310             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
311             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
312             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
313
314             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
315             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
316             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
317             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
318             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
319             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
320             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
321             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
322             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
323
324             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
325             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
326             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
327             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
328             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
329             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
330             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
331             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
332             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
333
334             fjx0             = _mm256_setzero_ps();
335             fjy0             = _mm256_setzero_ps();
336             fjz0             = _mm256_setzero_ps();
337             fjx1             = _mm256_setzero_ps();
338             fjy1             = _mm256_setzero_ps();
339             fjz1             = _mm256_setzero_ps();
340             fjx2             = _mm256_setzero_ps();
341             fjy2             = _mm256_setzero_ps();
342             fjz2             = _mm256_setzero_ps();
343
344             /**************************
345              * CALCULATE INTERACTIONS *
346              **************************/
347
348             if (gmx_mm256_any_lt(rsq00,rcutoff2))
349             {
350
351             r00              = _mm256_mul_ps(rsq00,rinv00);
352
353             /* EWALD ELECTROSTATICS */
354             
355             /* Analytical PME correction */
356             zeta2            = _mm256_mul_ps(beta2,rsq00);
357             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
358             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
359             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
360             felec            = _mm256_mul_ps(qq00,felec);
361             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
362             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
363             velec            = _mm256_sub_ps(rinv00,pmecorrV);
364             velec            = _mm256_mul_ps(qq00,velec);
365             
366             /* LENNARD-JONES DISPERSION/REPULSION */
367
368             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
369             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
370             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
371             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
372             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
373
374             d                = _mm256_sub_ps(r00,rswitch);
375             d                = _mm256_max_ps(d,_mm256_setzero_ps());
376             d2               = _mm256_mul_ps(d,d);
377             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
378
379             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
380
381             /* Evaluate switch function */
382             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
383             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
384             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
385             velec            = _mm256_mul_ps(velec,sw);
386             vvdw             = _mm256_mul_ps(vvdw,sw);
387             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
388
389             /* Update potential sum for this i atom from the interaction with this j atom. */
390             velec            = _mm256_and_ps(velec,cutoff_mask);
391             velecsum         = _mm256_add_ps(velecsum,velec);
392             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
393             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
394
395             fscal            = _mm256_add_ps(felec,fvdw);
396
397             fscal            = _mm256_and_ps(fscal,cutoff_mask);
398
399             /* Calculate temporary vectorial force */
400             tx               = _mm256_mul_ps(fscal,dx00);
401             ty               = _mm256_mul_ps(fscal,dy00);
402             tz               = _mm256_mul_ps(fscal,dz00);
403
404             /* Update vectorial force */
405             fix0             = _mm256_add_ps(fix0,tx);
406             fiy0             = _mm256_add_ps(fiy0,ty);
407             fiz0             = _mm256_add_ps(fiz0,tz);
408
409             fjx0             = _mm256_add_ps(fjx0,tx);
410             fjy0             = _mm256_add_ps(fjy0,ty);
411             fjz0             = _mm256_add_ps(fjz0,tz);
412
413             }
414
415             /**************************
416              * CALCULATE INTERACTIONS *
417              **************************/
418
419             if (gmx_mm256_any_lt(rsq01,rcutoff2))
420             {
421
422             r01              = _mm256_mul_ps(rsq01,rinv01);
423
424             /* EWALD ELECTROSTATICS */
425             
426             /* Analytical PME correction */
427             zeta2            = _mm256_mul_ps(beta2,rsq01);
428             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
429             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
430             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
431             felec            = _mm256_mul_ps(qq01,felec);
432             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
433             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
434             velec            = _mm256_sub_ps(rinv01,pmecorrV);
435             velec            = _mm256_mul_ps(qq01,velec);
436             
437             d                = _mm256_sub_ps(r01,rswitch);
438             d                = _mm256_max_ps(d,_mm256_setzero_ps());
439             d2               = _mm256_mul_ps(d,d);
440             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
441
442             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
443
444             /* Evaluate switch function */
445             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
446             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
447             velec            = _mm256_mul_ps(velec,sw);
448             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
449
450             /* Update potential sum for this i atom from the interaction with this j atom. */
451             velec            = _mm256_and_ps(velec,cutoff_mask);
452             velecsum         = _mm256_add_ps(velecsum,velec);
453
454             fscal            = felec;
455
456             fscal            = _mm256_and_ps(fscal,cutoff_mask);
457
458             /* Calculate temporary vectorial force */
459             tx               = _mm256_mul_ps(fscal,dx01);
460             ty               = _mm256_mul_ps(fscal,dy01);
461             tz               = _mm256_mul_ps(fscal,dz01);
462
463             /* Update vectorial force */
464             fix0             = _mm256_add_ps(fix0,tx);
465             fiy0             = _mm256_add_ps(fiy0,ty);
466             fiz0             = _mm256_add_ps(fiz0,tz);
467
468             fjx1             = _mm256_add_ps(fjx1,tx);
469             fjy1             = _mm256_add_ps(fjy1,ty);
470             fjz1             = _mm256_add_ps(fjz1,tz);
471
472             }
473
474             /**************************
475              * CALCULATE INTERACTIONS *
476              **************************/
477
478             if (gmx_mm256_any_lt(rsq02,rcutoff2))
479             {
480
481             r02              = _mm256_mul_ps(rsq02,rinv02);
482
483             /* EWALD ELECTROSTATICS */
484             
485             /* Analytical PME correction */
486             zeta2            = _mm256_mul_ps(beta2,rsq02);
487             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
488             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
489             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
490             felec            = _mm256_mul_ps(qq02,felec);
491             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
492             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
493             velec            = _mm256_sub_ps(rinv02,pmecorrV);
494             velec            = _mm256_mul_ps(qq02,velec);
495             
496             d                = _mm256_sub_ps(r02,rswitch);
497             d                = _mm256_max_ps(d,_mm256_setzero_ps());
498             d2               = _mm256_mul_ps(d,d);
499             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
500
501             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
502
503             /* Evaluate switch function */
504             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
505             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
506             velec            = _mm256_mul_ps(velec,sw);
507             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
508
509             /* Update potential sum for this i atom from the interaction with this j atom. */
510             velec            = _mm256_and_ps(velec,cutoff_mask);
511             velecsum         = _mm256_add_ps(velecsum,velec);
512
513             fscal            = felec;
514
515             fscal            = _mm256_and_ps(fscal,cutoff_mask);
516
517             /* Calculate temporary vectorial force */
518             tx               = _mm256_mul_ps(fscal,dx02);
519             ty               = _mm256_mul_ps(fscal,dy02);
520             tz               = _mm256_mul_ps(fscal,dz02);
521
522             /* Update vectorial force */
523             fix0             = _mm256_add_ps(fix0,tx);
524             fiy0             = _mm256_add_ps(fiy0,ty);
525             fiz0             = _mm256_add_ps(fiz0,tz);
526
527             fjx2             = _mm256_add_ps(fjx2,tx);
528             fjy2             = _mm256_add_ps(fjy2,ty);
529             fjz2             = _mm256_add_ps(fjz2,tz);
530
531             }
532
533             /**************************
534              * CALCULATE INTERACTIONS *
535              **************************/
536
537             if (gmx_mm256_any_lt(rsq10,rcutoff2))
538             {
539
540             r10              = _mm256_mul_ps(rsq10,rinv10);
541
542             /* EWALD ELECTROSTATICS */
543             
544             /* Analytical PME correction */
545             zeta2            = _mm256_mul_ps(beta2,rsq10);
546             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
547             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
548             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
549             felec            = _mm256_mul_ps(qq10,felec);
550             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
551             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
552             velec            = _mm256_sub_ps(rinv10,pmecorrV);
553             velec            = _mm256_mul_ps(qq10,velec);
554             
555             d                = _mm256_sub_ps(r10,rswitch);
556             d                = _mm256_max_ps(d,_mm256_setzero_ps());
557             d2               = _mm256_mul_ps(d,d);
558             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
559
560             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
561
562             /* Evaluate switch function */
563             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
564             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
565             velec            = _mm256_mul_ps(velec,sw);
566             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
567
568             /* Update potential sum for this i atom from the interaction with this j atom. */
569             velec            = _mm256_and_ps(velec,cutoff_mask);
570             velecsum         = _mm256_add_ps(velecsum,velec);
571
572             fscal            = felec;
573
574             fscal            = _mm256_and_ps(fscal,cutoff_mask);
575
576             /* Calculate temporary vectorial force */
577             tx               = _mm256_mul_ps(fscal,dx10);
578             ty               = _mm256_mul_ps(fscal,dy10);
579             tz               = _mm256_mul_ps(fscal,dz10);
580
581             /* Update vectorial force */
582             fix1             = _mm256_add_ps(fix1,tx);
583             fiy1             = _mm256_add_ps(fiy1,ty);
584             fiz1             = _mm256_add_ps(fiz1,tz);
585
586             fjx0             = _mm256_add_ps(fjx0,tx);
587             fjy0             = _mm256_add_ps(fjy0,ty);
588             fjz0             = _mm256_add_ps(fjz0,tz);
589
590             }
591
592             /**************************
593              * CALCULATE INTERACTIONS *
594              **************************/
595
596             if (gmx_mm256_any_lt(rsq11,rcutoff2))
597             {
598
599             r11              = _mm256_mul_ps(rsq11,rinv11);
600
601             /* EWALD ELECTROSTATICS */
602             
603             /* Analytical PME correction */
604             zeta2            = _mm256_mul_ps(beta2,rsq11);
605             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
606             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
607             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
608             felec            = _mm256_mul_ps(qq11,felec);
609             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
610             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
611             velec            = _mm256_sub_ps(rinv11,pmecorrV);
612             velec            = _mm256_mul_ps(qq11,velec);
613             
614             d                = _mm256_sub_ps(r11,rswitch);
615             d                = _mm256_max_ps(d,_mm256_setzero_ps());
616             d2               = _mm256_mul_ps(d,d);
617             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
618
619             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
620
621             /* Evaluate switch function */
622             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
623             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
624             velec            = _mm256_mul_ps(velec,sw);
625             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
626
627             /* Update potential sum for this i atom from the interaction with this j atom. */
628             velec            = _mm256_and_ps(velec,cutoff_mask);
629             velecsum         = _mm256_add_ps(velecsum,velec);
630
631             fscal            = felec;
632
633             fscal            = _mm256_and_ps(fscal,cutoff_mask);
634
635             /* Calculate temporary vectorial force */
636             tx               = _mm256_mul_ps(fscal,dx11);
637             ty               = _mm256_mul_ps(fscal,dy11);
638             tz               = _mm256_mul_ps(fscal,dz11);
639
640             /* Update vectorial force */
641             fix1             = _mm256_add_ps(fix1,tx);
642             fiy1             = _mm256_add_ps(fiy1,ty);
643             fiz1             = _mm256_add_ps(fiz1,tz);
644
645             fjx1             = _mm256_add_ps(fjx1,tx);
646             fjy1             = _mm256_add_ps(fjy1,ty);
647             fjz1             = _mm256_add_ps(fjz1,tz);
648
649             }
650
651             /**************************
652              * CALCULATE INTERACTIONS *
653              **************************/
654
655             if (gmx_mm256_any_lt(rsq12,rcutoff2))
656             {
657
658             r12              = _mm256_mul_ps(rsq12,rinv12);
659
660             /* EWALD ELECTROSTATICS */
661             
662             /* Analytical PME correction */
663             zeta2            = _mm256_mul_ps(beta2,rsq12);
664             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
665             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
666             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
667             felec            = _mm256_mul_ps(qq12,felec);
668             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
669             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
670             velec            = _mm256_sub_ps(rinv12,pmecorrV);
671             velec            = _mm256_mul_ps(qq12,velec);
672             
673             d                = _mm256_sub_ps(r12,rswitch);
674             d                = _mm256_max_ps(d,_mm256_setzero_ps());
675             d2               = _mm256_mul_ps(d,d);
676             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
677
678             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
679
680             /* Evaluate switch function */
681             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
682             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
683             velec            = _mm256_mul_ps(velec,sw);
684             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
685
686             /* Update potential sum for this i atom from the interaction with this j atom. */
687             velec            = _mm256_and_ps(velec,cutoff_mask);
688             velecsum         = _mm256_add_ps(velecsum,velec);
689
690             fscal            = felec;
691
692             fscal            = _mm256_and_ps(fscal,cutoff_mask);
693
694             /* Calculate temporary vectorial force */
695             tx               = _mm256_mul_ps(fscal,dx12);
696             ty               = _mm256_mul_ps(fscal,dy12);
697             tz               = _mm256_mul_ps(fscal,dz12);
698
699             /* Update vectorial force */
700             fix1             = _mm256_add_ps(fix1,tx);
701             fiy1             = _mm256_add_ps(fiy1,ty);
702             fiz1             = _mm256_add_ps(fiz1,tz);
703
704             fjx2             = _mm256_add_ps(fjx2,tx);
705             fjy2             = _mm256_add_ps(fjy2,ty);
706             fjz2             = _mm256_add_ps(fjz2,tz);
707
708             }
709
710             /**************************
711              * CALCULATE INTERACTIONS *
712              **************************/
713
714             if (gmx_mm256_any_lt(rsq20,rcutoff2))
715             {
716
717             r20              = _mm256_mul_ps(rsq20,rinv20);
718
719             /* EWALD ELECTROSTATICS */
720             
721             /* Analytical PME correction */
722             zeta2            = _mm256_mul_ps(beta2,rsq20);
723             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
724             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
725             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
726             felec            = _mm256_mul_ps(qq20,felec);
727             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
728             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
729             velec            = _mm256_sub_ps(rinv20,pmecorrV);
730             velec            = _mm256_mul_ps(qq20,velec);
731             
732             d                = _mm256_sub_ps(r20,rswitch);
733             d                = _mm256_max_ps(d,_mm256_setzero_ps());
734             d2               = _mm256_mul_ps(d,d);
735             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
736
737             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
738
739             /* Evaluate switch function */
740             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
741             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
742             velec            = _mm256_mul_ps(velec,sw);
743             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
744
745             /* Update potential sum for this i atom from the interaction with this j atom. */
746             velec            = _mm256_and_ps(velec,cutoff_mask);
747             velecsum         = _mm256_add_ps(velecsum,velec);
748
749             fscal            = felec;
750
751             fscal            = _mm256_and_ps(fscal,cutoff_mask);
752
753             /* Calculate temporary vectorial force */
754             tx               = _mm256_mul_ps(fscal,dx20);
755             ty               = _mm256_mul_ps(fscal,dy20);
756             tz               = _mm256_mul_ps(fscal,dz20);
757
758             /* Update vectorial force */
759             fix2             = _mm256_add_ps(fix2,tx);
760             fiy2             = _mm256_add_ps(fiy2,ty);
761             fiz2             = _mm256_add_ps(fiz2,tz);
762
763             fjx0             = _mm256_add_ps(fjx0,tx);
764             fjy0             = _mm256_add_ps(fjy0,ty);
765             fjz0             = _mm256_add_ps(fjz0,tz);
766
767             }
768
769             /**************************
770              * CALCULATE INTERACTIONS *
771              **************************/
772
773             if (gmx_mm256_any_lt(rsq21,rcutoff2))
774             {
775
776             r21              = _mm256_mul_ps(rsq21,rinv21);
777
778             /* EWALD ELECTROSTATICS */
779             
780             /* Analytical PME correction */
781             zeta2            = _mm256_mul_ps(beta2,rsq21);
782             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
783             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
784             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
785             felec            = _mm256_mul_ps(qq21,felec);
786             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
787             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
788             velec            = _mm256_sub_ps(rinv21,pmecorrV);
789             velec            = _mm256_mul_ps(qq21,velec);
790             
791             d                = _mm256_sub_ps(r21,rswitch);
792             d                = _mm256_max_ps(d,_mm256_setzero_ps());
793             d2               = _mm256_mul_ps(d,d);
794             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
795
796             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
797
798             /* Evaluate switch function */
799             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
800             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
801             velec            = _mm256_mul_ps(velec,sw);
802             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
803
804             /* Update potential sum for this i atom from the interaction with this j atom. */
805             velec            = _mm256_and_ps(velec,cutoff_mask);
806             velecsum         = _mm256_add_ps(velecsum,velec);
807
808             fscal            = felec;
809
810             fscal            = _mm256_and_ps(fscal,cutoff_mask);
811
812             /* Calculate temporary vectorial force */
813             tx               = _mm256_mul_ps(fscal,dx21);
814             ty               = _mm256_mul_ps(fscal,dy21);
815             tz               = _mm256_mul_ps(fscal,dz21);
816
817             /* Update vectorial force */
818             fix2             = _mm256_add_ps(fix2,tx);
819             fiy2             = _mm256_add_ps(fiy2,ty);
820             fiz2             = _mm256_add_ps(fiz2,tz);
821
822             fjx1             = _mm256_add_ps(fjx1,tx);
823             fjy1             = _mm256_add_ps(fjy1,ty);
824             fjz1             = _mm256_add_ps(fjz1,tz);
825
826             }
827
828             /**************************
829              * CALCULATE INTERACTIONS *
830              **************************/
831
832             if (gmx_mm256_any_lt(rsq22,rcutoff2))
833             {
834
835             r22              = _mm256_mul_ps(rsq22,rinv22);
836
837             /* EWALD ELECTROSTATICS */
838             
839             /* Analytical PME correction */
840             zeta2            = _mm256_mul_ps(beta2,rsq22);
841             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
842             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
843             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
844             felec            = _mm256_mul_ps(qq22,felec);
845             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
846             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
847             velec            = _mm256_sub_ps(rinv22,pmecorrV);
848             velec            = _mm256_mul_ps(qq22,velec);
849             
850             d                = _mm256_sub_ps(r22,rswitch);
851             d                = _mm256_max_ps(d,_mm256_setzero_ps());
852             d2               = _mm256_mul_ps(d,d);
853             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
854
855             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
856
857             /* Evaluate switch function */
858             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
859             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
860             velec            = _mm256_mul_ps(velec,sw);
861             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
862
863             /* Update potential sum for this i atom from the interaction with this j atom. */
864             velec            = _mm256_and_ps(velec,cutoff_mask);
865             velecsum         = _mm256_add_ps(velecsum,velec);
866
867             fscal            = felec;
868
869             fscal            = _mm256_and_ps(fscal,cutoff_mask);
870
871             /* Calculate temporary vectorial force */
872             tx               = _mm256_mul_ps(fscal,dx22);
873             ty               = _mm256_mul_ps(fscal,dy22);
874             tz               = _mm256_mul_ps(fscal,dz22);
875
876             /* Update vectorial force */
877             fix2             = _mm256_add_ps(fix2,tx);
878             fiy2             = _mm256_add_ps(fiy2,ty);
879             fiz2             = _mm256_add_ps(fiz2,tz);
880
881             fjx2             = _mm256_add_ps(fjx2,tx);
882             fjy2             = _mm256_add_ps(fjy2,ty);
883             fjz2             = _mm256_add_ps(fjz2,tz);
884
885             }
886
887             fjptrA             = f+j_coord_offsetA;
888             fjptrB             = f+j_coord_offsetB;
889             fjptrC             = f+j_coord_offsetC;
890             fjptrD             = f+j_coord_offsetD;
891             fjptrE             = f+j_coord_offsetE;
892             fjptrF             = f+j_coord_offsetF;
893             fjptrG             = f+j_coord_offsetG;
894             fjptrH             = f+j_coord_offsetH;
895
896             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
897                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
898
899             /* Inner loop uses 990 flops */
900         }
901
902         if(jidx<j_index_end)
903         {
904
905             /* Get j neighbor index, and coordinate index */
906             jnrlistA         = jjnr[jidx];
907             jnrlistB         = jjnr[jidx+1];
908             jnrlistC         = jjnr[jidx+2];
909             jnrlistD         = jjnr[jidx+3];
910             jnrlistE         = jjnr[jidx+4];
911             jnrlistF         = jjnr[jidx+5];
912             jnrlistG         = jjnr[jidx+6];
913             jnrlistH         = jjnr[jidx+7];
914             /* Sign of each element will be negative for non-real atoms.
915              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
916              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
917              */
918             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
919                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
920                                             
921             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
922             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
923             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
924             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
925             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
926             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
927             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
928             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
929             j_coord_offsetA  = DIM*jnrA;
930             j_coord_offsetB  = DIM*jnrB;
931             j_coord_offsetC  = DIM*jnrC;
932             j_coord_offsetD  = DIM*jnrD;
933             j_coord_offsetE  = DIM*jnrE;
934             j_coord_offsetF  = DIM*jnrF;
935             j_coord_offsetG  = DIM*jnrG;
936             j_coord_offsetH  = DIM*jnrH;
937
938             /* load j atom coordinates */
939             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
940                                                  x+j_coord_offsetC,x+j_coord_offsetD,
941                                                  x+j_coord_offsetE,x+j_coord_offsetF,
942                                                  x+j_coord_offsetG,x+j_coord_offsetH,
943                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
944
945             /* Calculate displacement vector */
946             dx00             = _mm256_sub_ps(ix0,jx0);
947             dy00             = _mm256_sub_ps(iy0,jy0);
948             dz00             = _mm256_sub_ps(iz0,jz0);
949             dx01             = _mm256_sub_ps(ix0,jx1);
950             dy01             = _mm256_sub_ps(iy0,jy1);
951             dz01             = _mm256_sub_ps(iz0,jz1);
952             dx02             = _mm256_sub_ps(ix0,jx2);
953             dy02             = _mm256_sub_ps(iy0,jy2);
954             dz02             = _mm256_sub_ps(iz0,jz2);
955             dx10             = _mm256_sub_ps(ix1,jx0);
956             dy10             = _mm256_sub_ps(iy1,jy0);
957             dz10             = _mm256_sub_ps(iz1,jz0);
958             dx11             = _mm256_sub_ps(ix1,jx1);
959             dy11             = _mm256_sub_ps(iy1,jy1);
960             dz11             = _mm256_sub_ps(iz1,jz1);
961             dx12             = _mm256_sub_ps(ix1,jx2);
962             dy12             = _mm256_sub_ps(iy1,jy2);
963             dz12             = _mm256_sub_ps(iz1,jz2);
964             dx20             = _mm256_sub_ps(ix2,jx0);
965             dy20             = _mm256_sub_ps(iy2,jy0);
966             dz20             = _mm256_sub_ps(iz2,jz0);
967             dx21             = _mm256_sub_ps(ix2,jx1);
968             dy21             = _mm256_sub_ps(iy2,jy1);
969             dz21             = _mm256_sub_ps(iz2,jz1);
970             dx22             = _mm256_sub_ps(ix2,jx2);
971             dy22             = _mm256_sub_ps(iy2,jy2);
972             dz22             = _mm256_sub_ps(iz2,jz2);
973
974             /* Calculate squared distance and things based on it */
975             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
976             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
977             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
978             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
979             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
980             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
981             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
982             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
983             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
984
985             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
986             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
987             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
988             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
989             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
990             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
991             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
992             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
993             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
994
995             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
996             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
997             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
998             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
999             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1000             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1001             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1002             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1003             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1004
1005             fjx0             = _mm256_setzero_ps();
1006             fjy0             = _mm256_setzero_ps();
1007             fjz0             = _mm256_setzero_ps();
1008             fjx1             = _mm256_setzero_ps();
1009             fjy1             = _mm256_setzero_ps();
1010             fjz1             = _mm256_setzero_ps();
1011             fjx2             = _mm256_setzero_ps();
1012             fjy2             = _mm256_setzero_ps();
1013             fjz2             = _mm256_setzero_ps();
1014
1015             /**************************
1016              * CALCULATE INTERACTIONS *
1017              **************************/
1018
1019             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1020             {
1021
1022             r00              = _mm256_mul_ps(rsq00,rinv00);
1023             r00              = _mm256_andnot_ps(dummy_mask,r00);
1024
1025             /* EWALD ELECTROSTATICS */
1026             
1027             /* Analytical PME correction */
1028             zeta2            = _mm256_mul_ps(beta2,rsq00);
1029             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1030             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1031             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1032             felec            = _mm256_mul_ps(qq00,felec);
1033             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1034             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1035             velec            = _mm256_sub_ps(rinv00,pmecorrV);
1036             velec            = _mm256_mul_ps(qq00,velec);
1037             
1038             /* LENNARD-JONES DISPERSION/REPULSION */
1039
1040             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1041             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
1042             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1043             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1044             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1045
1046             d                = _mm256_sub_ps(r00,rswitch);
1047             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1048             d2               = _mm256_mul_ps(d,d);
1049             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1050
1051             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1052
1053             /* Evaluate switch function */
1054             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1055             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1056             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1057             velec            = _mm256_mul_ps(velec,sw);
1058             vvdw             = _mm256_mul_ps(vvdw,sw);
1059             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1060
1061             /* Update potential sum for this i atom from the interaction with this j atom. */
1062             velec            = _mm256_and_ps(velec,cutoff_mask);
1063             velec            = _mm256_andnot_ps(dummy_mask,velec);
1064             velecsum         = _mm256_add_ps(velecsum,velec);
1065             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
1066             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
1067             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
1068
1069             fscal            = _mm256_add_ps(felec,fvdw);
1070
1071             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1072
1073             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1074
1075             /* Calculate temporary vectorial force */
1076             tx               = _mm256_mul_ps(fscal,dx00);
1077             ty               = _mm256_mul_ps(fscal,dy00);
1078             tz               = _mm256_mul_ps(fscal,dz00);
1079
1080             /* Update vectorial force */
1081             fix0             = _mm256_add_ps(fix0,tx);
1082             fiy0             = _mm256_add_ps(fiy0,ty);
1083             fiz0             = _mm256_add_ps(fiz0,tz);
1084
1085             fjx0             = _mm256_add_ps(fjx0,tx);
1086             fjy0             = _mm256_add_ps(fjy0,ty);
1087             fjz0             = _mm256_add_ps(fjz0,tz);
1088
1089             }
1090
1091             /**************************
1092              * CALCULATE INTERACTIONS *
1093              **************************/
1094
1095             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1096             {
1097
1098             r01              = _mm256_mul_ps(rsq01,rinv01);
1099             r01              = _mm256_andnot_ps(dummy_mask,r01);
1100
1101             /* EWALD ELECTROSTATICS */
1102             
1103             /* Analytical PME correction */
1104             zeta2            = _mm256_mul_ps(beta2,rsq01);
1105             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1106             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1107             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1108             felec            = _mm256_mul_ps(qq01,felec);
1109             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1110             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1111             velec            = _mm256_sub_ps(rinv01,pmecorrV);
1112             velec            = _mm256_mul_ps(qq01,velec);
1113             
1114             d                = _mm256_sub_ps(r01,rswitch);
1115             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1116             d2               = _mm256_mul_ps(d,d);
1117             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1118
1119             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1120
1121             /* Evaluate switch function */
1122             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1123             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
1124             velec            = _mm256_mul_ps(velec,sw);
1125             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1126
1127             /* Update potential sum for this i atom from the interaction with this j atom. */
1128             velec            = _mm256_and_ps(velec,cutoff_mask);
1129             velec            = _mm256_andnot_ps(dummy_mask,velec);
1130             velecsum         = _mm256_add_ps(velecsum,velec);
1131
1132             fscal            = felec;
1133
1134             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1135
1136             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1137
1138             /* Calculate temporary vectorial force */
1139             tx               = _mm256_mul_ps(fscal,dx01);
1140             ty               = _mm256_mul_ps(fscal,dy01);
1141             tz               = _mm256_mul_ps(fscal,dz01);
1142
1143             /* Update vectorial force */
1144             fix0             = _mm256_add_ps(fix0,tx);
1145             fiy0             = _mm256_add_ps(fiy0,ty);
1146             fiz0             = _mm256_add_ps(fiz0,tz);
1147
1148             fjx1             = _mm256_add_ps(fjx1,tx);
1149             fjy1             = _mm256_add_ps(fjy1,ty);
1150             fjz1             = _mm256_add_ps(fjz1,tz);
1151
1152             }
1153
1154             /**************************
1155              * CALCULATE INTERACTIONS *
1156              **************************/
1157
1158             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1159             {
1160
1161             r02              = _mm256_mul_ps(rsq02,rinv02);
1162             r02              = _mm256_andnot_ps(dummy_mask,r02);
1163
1164             /* EWALD ELECTROSTATICS */
1165             
1166             /* Analytical PME correction */
1167             zeta2            = _mm256_mul_ps(beta2,rsq02);
1168             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1169             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1170             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1171             felec            = _mm256_mul_ps(qq02,felec);
1172             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1173             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1174             velec            = _mm256_sub_ps(rinv02,pmecorrV);
1175             velec            = _mm256_mul_ps(qq02,velec);
1176             
1177             d                = _mm256_sub_ps(r02,rswitch);
1178             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1179             d2               = _mm256_mul_ps(d,d);
1180             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1181
1182             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1183
1184             /* Evaluate switch function */
1185             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1186             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
1187             velec            = _mm256_mul_ps(velec,sw);
1188             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1189
1190             /* Update potential sum for this i atom from the interaction with this j atom. */
1191             velec            = _mm256_and_ps(velec,cutoff_mask);
1192             velec            = _mm256_andnot_ps(dummy_mask,velec);
1193             velecsum         = _mm256_add_ps(velecsum,velec);
1194
1195             fscal            = felec;
1196
1197             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1198
1199             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1200
1201             /* Calculate temporary vectorial force */
1202             tx               = _mm256_mul_ps(fscal,dx02);
1203             ty               = _mm256_mul_ps(fscal,dy02);
1204             tz               = _mm256_mul_ps(fscal,dz02);
1205
1206             /* Update vectorial force */
1207             fix0             = _mm256_add_ps(fix0,tx);
1208             fiy0             = _mm256_add_ps(fiy0,ty);
1209             fiz0             = _mm256_add_ps(fiz0,tz);
1210
1211             fjx2             = _mm256_add_ps(fjx2,tx);
1212             fjy2             = _mm256_add_ps(fjy2,ty);
1213             fjz2             = _mm256_add_ps(fjz2,tz);
1214
1215             }
1216
1217             /**************************
1218              * CALCULATE INTERACTIONS *
1219              **************************/
1220
1221             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1222             {
1223
1224             r10              = _mm256_mul_ps(rsq10,rinv10);
1225             r10              = _mm256_andnot_ps(dummy_mask,r10);
1226
1227             /* EWALD ELECTROSTATICS */
1228             
1229             /* Analytical PME correction */
1230             zeta2            = _mm256_mul_ps(beta2,rsq10);
1231             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1232             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1233             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1234             felec            = _mm256_mul_ps(qq10,felec);
1235             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1236             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1237             velec            = _mm256_sub_ps(rinv10,pmecorrV);
1238             velec            = _mm256_mul_ps(qq10,velec);
1239             
1240             d                = _mm256_sub_ps(r10,rswitch);
1241             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1242             d2               = _mm256_mul_ps(d,d);
1243             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1244
1245             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1246
1247             /* Evaluate switch function */
1248             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1249             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1250             velec            = _mm256_mul_ps(velec,sw);
1251             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1252
1253             /* Update potential sum for this i atom from the interaction with this j atom. */
1254             velec            = _mm256_and_ps(velec,cutoff_mask);
1255             velec            = _mm256_andnot_ps(dummy_mask,velec);
1256             velecsum         = _mm256_add_ps(velecsum,velec);
1257
1258             fscal            = felec;
1259
1260             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1261
1262             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1263
1264             /* Calculate temporary vectorial force */
1265             tx               = _mm256_mul_ps(fscal,dx10);
1266             ty               = _mm256_mul_ps(fscal,dy10);
1267             tz               = _mm256_mul_ps(fscal,dz10);
1268
1269             /* Update vectorial force */
1270             fix1             = _mm256_add_ps(fix1,tx);
1271             fiy1             = _mm256_add_ps(fiy1,ty);
1272             fiz1             = _mm256_add_ps(fiz1,tz);
1273
1274             fjx0             = _mm256_add_ps(fjx0,tx);
1275             fjy0             = _mm256_add_ps(fjy0,ty);
1276             fjz0             = _mm256_add_ps(fjz0,tz);
1277
1278             }
1279
1280             /**************************
1281              * CALCULATE INTERACTIONS *
1282              **************************/
1283
1284             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1285             {
1286
1287             r11              = _mm256_mul_ps(rsq11,rinv11);
1288             r11              = _mm256_andnot_ps(dummy_mask,r11);
1289
1290             /* EWALD ELECTROSTATICS */
1291             
1292             /* Analytical PME correction */
1293             zeta2            = _mm256_mul_ps(beta2,rsq11);
1294             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1295             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1296             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1297             felec            = _mm256_mul_ps(qq11,felec);
1298             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1299             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1300             velec            = _mm256_sub_ps(rinv11,pmecorrV);
1301             velec            = _mm256_mul_ps(qq11,velec);
1302             
1303             d                = _mm256_sub_ps(r11,rswitch);
1304             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1305             d2               = _mm256_mul_ps(d,d);
1306             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1307
1308             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1309
1310             /* Evaluate switch function */
1311             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1312             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1313             velec            = _mm256_mul_ps(velec,sw);
1314             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1315
1316             /* Update potential sum for this i atom from the interaction with this j atom. */
1317             velec            = _mm256_and_ps(velec,cutoff_mask);
1318             velec            = _mm256_andnot_ps(dummy_mask,velec);
1319             velecsum         = _mm256_add_ps(velecsum,velec);
1320
1321             fscal            = felec;
1322
1323             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1324
1325             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1326
1327             /* Calculate temporary vectorial force */
1328             tx               = _mm256_mul_ps(fscal,dx11);
1329             ty               = _mm256_mul_ps(fscal,dy11);
1330             tz               = _mm256_mul_ps(fscal,dz11);
1331
1332             /* Update vectorial force */
1333             fix1             = _mm256_add_ps(fix1,tx);
1334             fiy1             = _mm256_add_ps(fiy1,ty);
1335             fiz1             = _mm256_add_ps(fiz1,tz);
1336
1337             fjx1             = _mm256_add_ps(fjx1,tx);
1338             fjy1             = _mm256_add_ps(fjy1,ty);
1339             fjz1             = _mm256_add_ps(fjz1,tz);
1340
1341             }
1342
1343             /**************************
1344              * CALCULATE INTERACTIONS *
1345              **************************/
1346
1347             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1348             {
1349
1350             r12              = _mm256_mul_ps(rsq12,rinv12);
1351             r12              = _mm256_andnot_ps(dummy_mask,r12);
1352
1353             /* EWALD ELECTROSTATICS */
1354             
1355             /* Analytical PME correction */
1356             zeta2            = _mm256_mul_ps(beta2,rsq12);
1357             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1358             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1359             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1360             felec            = _mm256_mul_ps(qq12,felec);
1361             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1362             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1363             velec            = _mm256_sub_ps(rinv12,pmecorrV);
1364             velec            = _mm256_mul_ps(qq12,velec);
1365             
1366             d                = _mm256_sub_ps(r12,rswitch);
1367             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1368             d2               = _mm256_mul_ps(d,d);
1369             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1370
1371             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1372
1373             /* Evaluate switch function */
1374             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1375             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1376             velec            = _mm256_mul_ps(velec,sw);
1377             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1378
1379             /* Update potential sum for this i atom from the interaction with this j atom. */
1380             velec            = _mm256_and_ps(velec,cutoff_mask);
1381             velec            = _mm256_andnot_ps(dummy_mask,velec);
1382             velecsum         = _mm256_add_ps(velecsum,velec);
1383
1384             fscal            = felec;
1385
1386             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1387
1388             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1389
1390             /* Calculate temporary vectorial force */
1391             tx               = _mm256_mul_ps(fscal,dx12);
1392             ty               = _mm256_mul_ps(fscal,dy12);
1393             tz               = _mm256_mul_ps(fscal,dz12);
1394
1395             /* Update vectorial force */
1396             fix1             = _mm256_add_ps(fix1,tx);
1397             fiy1             = _mm256_add_ps(fiy1,ty);
1398             fiz1             = _mm256_add_ps(fiz1,tz);
1399
1400             fjx2             = _mm256_add_ps(fjx2,tx);
1401             fjy2             = _mm256_add_ps(fjy2,ty);
1402             fjz2             = _mm256_add_ps(fjz2,tz);
1403
1404             }
1405
1406             /**************************
1407              * CALCULATE INTERACTIONS *
1408              **************************/
1409
1410             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1411             {
1412
1413             r20              = _mm256_mul_ps(rsq20,rinv20);
1414             r20              = _mm256_andnot_ps(dummy_mask,r20);
1415
1416             /* EWALD ELECTROSTATICS */
1417             
1418             /* Analytical PME correction */
1419             zeta2            = _mm256_mul_ps(beta2,rsq20);
1420             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1421             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1422             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1423             felec            = _mm256_mul_ps(qq20,felec);
1424             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1425             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1426             velec            = _mm256_sub_ps(rinv20,pmecorrV);
1427             velec            = _mm256_mul_ps(qq20,velec);
1428             
1429             d                = _mm256_sub_ps(r20,rswitch);
1430             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1431             d2               = _mm256_mul_ps(d,d);
1432             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1433
1434             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1435
1436             /* Evaluate switch function */
1437             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1438             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1439             velec            = _mm256_mul_ps(velec,sw);
1440             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1441
1442             /* Update potential sum for this i atom from the interaction with this j atom. */
1443             velec            = _mm256_and_ps(velec,cutoff_mask);
1444             velec            = _mm256_andnot_ps(dummy_mask,velec);
1445             velecsum         = _mm256_add_ps(velecsum,velec);
1446
1447             fscal            = felec;
1448
1449             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1450
1451             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1452
1453             /* Calculate temporary vectorial force */
1454             tx               = _mm256_mul_ps(fscal,dx20);
1455             ty               = _mm256_mul_ps(fscal,dy20);
1456             tz               = _mm256_mul_ps(fscal,dz20);
1457
1458             /* Update vectorial force */
1459             fix2             = _mm256_add_ps(fix2,tx);
1460             fiy2             = _mm256_add_ps(fiy2,ty);
1461             fiz2             = _mm256_add_ps(fiz2,tz);
1462
1463             fjx0             = _mm256_add_ps(fjx0,tx);
1464             fjy0             = _mm256_add_ps(fjy0,ty);
1465             fjz0             = _mm256_add_ps(fjz0,tz);
1466
1467             }
1468
1469             /**************************
1470              * CALCULATE INTERACTIONS *
1471              **************************/
1472
1473             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1474             {
1475
1476             r21              = _mm256_mul_ps(rsq21,rinv21);
1477             r21              = _mm256_andnot_ps(dummy_mask,r21);
1478
1479             /* EWALD ELECTROSTATICS */
1480             
1481             /* Analytical PME correction */
1482             zeta2            = _mm256_mul_ps(beta2,rsq21);
1483             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1484             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1485             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1486             felec            = _mm256_mul_ps(qq21,felec);
1487             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1488             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1489             velec            = _mm256_sub_ps(rinv21,pmecorrV);
1490             velec            = _mm256_mul_ps(qq21,velec);
1491             
1492             d                = _mm256_sub_ps(r21,rswitch);
1493             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1494             d2               = _mm256_mul_ps(d,d);
1495             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1496
1497             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1498
1499             /* Evaluate switch function */
1500             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1501             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1502             velec            = _mm256_mul_ps(velec,sw);
1503             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1504
1505             /* Update potential sum for this i atom from the interaction with this j atom. */
1506             velec            = _mm256_and_ps(velec,cutoff_mask);
1507             velec            = _mm256_andnot_ps(dummy_mask,velec);
1508             velecsum         = _mm256_add_ps(velecsum,velec);
1509
1510             fscal            = felec;
1511
1512             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1513
1514             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1515
1516             /* Calculate temporary vectorial force */
1517             tx               = _mm256_mul_ps(fscal,dx21);
1518             ty               = _mm256_mul_ps(fscal,dy21);
1519             tz               = _mm256_mul_ps(fscal,dz21);
1520
1521             /* Update vectorial force */
1522             fix2             = _mm256_add_ps(fix2,tx);
1523             fiy2             = _mm256_add_ps(fiy2,ty);
1524             fiz2             = _mm256_add_ps(fiz2,tz);
1525
1526             fjx1             = _mm256_add_ps(fjx1,tx);
1527             fjy1             = _mm256_add_ps(fjy1,ty);
1528             fjz1             = _mm256_add_ps(fjz1,tz);
1529
1530             }
1531
1532             /**************************
1533              * CALCULATE INTERACTIONS *
1534              **************************/
1535
1536             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1537             {
1538
1539             r22              = _mm256_mul_ps(rsq22,rinv22);
1540             r22              = _mm256_andnot_ps(dummy_mask,r22);
1541
1542             /* EWALD ELECTROSTATICS */
1543             
1544             /* Analytical PME correction */
1545             zeta2            = _mm256_mul_ps(beta2,rsq22);
1546             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1547             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1548             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1549             felec            = _mm256_mul_ps(qq22,felec);
1550             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1551             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1552             velec            = _mm256_sub_ps(rinv22,pmecorrV);
1553             velec            = _mm256_mul_ps(qq22,velec);
1554             
1555             d                = _mm256_sub_ps(r22,rswitch);
1556             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1557             d2               = _mm256_mul_ps(d,d);
1558             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1559
1560             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1561
1562             /* Evaluate switch function */
1563             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1564             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1565             velec            = _mm256_mul_ps(velec,sw);
1566             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1567
1568             /* Update potential sum for this i atom from the interaction with this j atom. */
1569             velec            = _mm256_and_ps(velec,cutoff_mask);
1570             velec            = _mm256_andnot_ps(dummy_mask,velec);
1571             velecsum         = _mm256_add_ps(velecsum,velec);
1572
1573             fscal            = felec;
1574
1575             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1576
1577             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1578
1579             /* Calculate temporary vectorial force */
1580             tx               = _mm256_mul_ps(fscal,dx22);
1581             ty               = _mm256_mul_ps(fscal,dy22);
1582             tz               = _mm256_mul_ps(fscal,dz22);
1583
1584             /* Update vectorial force */
1585             fix2             = _mm256_add_ps(fix2,tx);
1586             fiy2             = _mm256_add_ps(fiy2,ty);
1587             fiz2             = _mm256_add_ps(fiz2,tz);
1588
1589             fjx2             = _mm256_add_ps(fjx2,tx);
1590             fjy2             = _mm256_add_ps(fjy2,ty);
1591             fjz2             = _mm256_add_ps(fjz2,tz);
1592
1593             }
1594
1595             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1596             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1597             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1598             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1599             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1600             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1601             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1602             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1603
1604             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1605                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1606
1607             /* Inner loop uses 999 flops */
1608         }
1609
1610         /* End of innermost loop */
1611
1612         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1613                                                  f+i_coord_offset,fshift+i_shift_offset);
1614
1615         ggid                        = gid[iidx];
1616         /* Update potential energies */
1617         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1618         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1619
1620         /* Increment number of inner iterations */
1621         inneriter                  += j_index_end - j_index_start;
1622
1623         /* Outer loop uses 20 flops */
1624     }
1625
1626     /* Increment number of outer iterations */
1627     outeriter        += nri;
1628
1629     /* Update outer/inner flops */
1630
1631     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*999);
1632 }
1633 /*
1634  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_single
1635  * Electrostatics interaction: Ewald
1636  * VdW interaction:            LennardJones
1637  * Geometry:                   Water3-Water3
1638  * Calculate force/pot:        Force
1639  */
1640 void
1641 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_single
1642                     (t_nblist                    * gmx_restrict       nlist,
1643                      rvec                        * gmx_restrict          xx,
1644                      rvec                        * gmx_restrict          ff,
1645                      t_forcerec                  * gmx_restrict          fr,
1646                      t_mdatoms                   * gmx_restrict     mdatoms,
1647                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1648                      t_nrnb                      * gmx_restrict        nrnb)
1649 {
1650     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1651      * just 0 for non-waters.
1652      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1653      * jnr indices corresponding to data put in the four positions in the SIMD register.
1654      */
1655     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1656     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1657     int              jnrA,jnrB,jnrC,jnrD;
1658     int              jnrE,jnrF,jnrG,jnrH;
1659     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1660     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1661     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1662     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1663     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1664     real             rcutoff_scalar;
1665     real             *shiftvec,*fshift,*x,*f;
1666     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1667     real             scratch[4*DIM];
1668     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1669     real *           vdwioffsetptr0;
1670     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1671     real *           vdwioffsetptr1;
1672     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1673     real *           vdwioffsetptr2;
1674     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1675     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1676     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1677     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1678     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1679     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1680     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1681     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1682     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1683     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1684     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1685     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1686     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1687     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1688     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1689     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1690     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1691     real             *charge;
1692     int              nvdwtype;
1693     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1694     int              *vdwtype;
1695     real             *vdwparam;
1696     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1697     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1698     __m256i          ewitab;
1699     __m128i          ewitab_lo,ewitab_hi;
1700     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1701     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1702     real             *ewtab;
1703     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1704     real             rswitch_scalar,d_scalar;
1705     __m256           dummy_mask,cutoff_mask;
1706     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1707     __m256           one     = _mm256_set1_ps(1.0);
1708     __m256           two     = _mm256_set1_ps(2.0);
1709     x                = xx[0];
1710     f                = ff[0];
1711
1712     nri              = nlist->nri;
1713     iinr             = nlist->iinr;
1714     jindex           = nlist->jindex;
1715     jjnr             = nlist->jjnr;
1716     shiftidx         = nlist->shift;
1717     gid              = nlist->gid;
1718     shiftvec         = fr->shift_vec[0];
1719     fshift           = fr->fshift[0];
1720     facel            = _mm256_set1_ps(fr->epsfac);
1721     charge           = mdatoms->chargeA;
1722     nvdwtype         = fr->ntype;
1723     vdwparam         = fr->nbfp;
1724     vdwtype          = mdatoms->typeA;
1725
1726     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1727     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
1728     beta2            = _mm256_mul_ps(beta,beta);
1729     beta3            = _mm256_mul_ps(beta,beta2);
1730
1731     ewtab            = fr->ic->tabq_coul_FDV0;
1732     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1733     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1734
1735     /* Setup water-specific parameters */
1736     inr              = nlist->iinr[0];
1737     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1738     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1739     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1740     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1741
1742     jq0              = _mm256_set1_ps(charge[inr+0]);
1743     jq1              = _mm256_set1_ps(charge[inr+1]);
1744     jq2              = _mm256_set1_ps(charge[inr+2]);
1745     vdwjidx0A        = 2*vdwtype[inr+0];
1746     qq00             = _mm256_mul_ps(iq0,jq0);
1747     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1748     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1749     qq01             = _mm256_mul_ps(iq0,jq1);
1750     qq02             = _mm256_mul_ps(iq0,jq2);
1751     qq10             = _mm256_mul_ps(iq1,jq0);
1752     qq11             = _mm256_mul_ps(iq1,jq1);
1753     qq12             = _mm256_mul_ps(iq1,jq2);
1754     qq20             = _mm256_mul_ps(iq2,jq0);
1755     qq21             = _mm256_mul_ps(iq2,jq1);
1756     qq22             = _mm256_mul_ps(iq2,jq2);
1757
1758     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1759     rcutoff_scalar   = fr->rcoulomb;
1760     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1761     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1762
1763     rswitch_scalar   = fr->rcoulomb_switch;
1764     rswitch          = _mm256_set1_ps(rswitch_scalar);
1765     /* Setup switch parameters */
1766     d_scalar         = rcutoff_scalar-rswitch_scalar;
1767     d                = _mm256_set1_ps(d_scalar);
1768     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1769     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1770     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1771     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1772     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1773     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1774
1775     /* Avoid stupid compiler warnings */
1776     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1777     j_coord_offsetA = 0;
1778     j_coord_offsetB = 0;
1779     j_coord_offsetC = 0;
1780     j_coord_offsetD = 0;
1781     j_coord_offsetE = 0;
1782     j_coord_offsetF = 0;
1783     j_coord_offsetG = 0;
1784     j_coord_offsetH = 0;
1785
1786     outeriter        = 0;
1787     inneriter        = 0;
1788
1789     for(iidx=0;iidx<4*DIM;iidx++)
1790     {
1791         scratch[iidx] = 0.0;
1792     }
1793
1794     /* Start outer loop over neighborlists */
1795     for(iidx=0; iidx<nri; iidx++)
1796     {
1797         /* Load shift vector for this list */
1798         i_shift_offset   = DIM*shiftidx[iidx];
1799
1800         /* Load limits for loop over neighbors */
1801         j_index_start    = jindex[iidx];
1802         j_index_end      = jindex[iidx+1];
1803
1804         /* Get outer coordinate index */
1805         inr              = iinr[iidx];
1806         i_coord_offset   = DIM*inr;
1807
1808         /* Load i particle coords and add shift vector */
1809         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1810                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1811
1812         fix0             = _mm256_setzero_ps();
1813         fiy0             = _mm256_setzero_ps();
1814         fiz0             = _mm256_setzero_ps();
1815         fix1             = _mm256_setzero_ps();
1816         fiy1             = _mm256_setzero_ps();
1817         fiz1             = _mm256_setzero_ps();
1818         fix2             = _mm256_setzero_ps();
1819         fiy2             = _mm256_setzero_ps();
1820         fiz2             = _mm256_setzero_ps();
1821
1822         /* Start inner kernel loop */
1823         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1824         {
1825
1826             /* Get j neighbor index, and coordinate index */
1827             jnrA             = jjnr[jidx];
1828             jnrB             = jjnr[jidx+1];
1829             jnrC             = jjnr[jidx+2];
1830             jnrD             = jjnr[jidx+3];
1831             jnrE             = jjnr[jidx+4];
1832             jnrF             = jjnr[jidx+5];
1833             jnrG             = jjnr[jidx+6];
1834             jnrH             = jjnr[jidx+7];
1835             j_coord_offsetA  = DIM*jnrA;
1836             j_coord_offsetB  = DIM*jnrB;
1837             j_coord_offsetC  = DIM*jnrC;
1838             j_coord_offsetD  = DIM*jnrD;
1839             j_coord_offsetE  = DIM*jnrE;
1840             j_coord_offsetF  = DIM*jnrF;
1841             j_coord_offsetG  = DIM*jnrG;
1842             j_coord_offsetH  = DIM*jnrH;
1843
1844             /* load j atom coordinates */
1845             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1846                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1847                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1848                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1849                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1850
1851             /* Calculate displacement vector */
1852             dx00             = _mm256_sub_ps(ix0,jx0);
1853             dy00             = _mm256_sub_ps(iy0,jy0);
1854             dz00             = _mm256_sub_ps(iz0,jz0);
1855             dx01             = _mm256_sub_ps(ix0,jx1);
1856             dy01             = _mm256_sub_ps(iy0,jy1);
1857             dz01             = _mm256_sub_ps(iz0,jz1);
1858             dx02             = _mm256_sub_ps(ix0,jx2);
1859             dy02             = _mm256_sub_ps(iy0,jy2);
1860             dz02             = _mm256_sub_ps(iz0,jz2);
1861             dx10             = _mm256_sub_ps(ix1,jx0);
1862             dy10             = _mm256_sub_ps(iy1,jy0);
1863             dz10             = _mm256_sub_ps(iz1,jz0);
1864             dx11             = _mm256_sub_ps(ix1,jx1);
1865             dy11             = _mm256_sub_ps(iy1,jy1);
1866             dz11             = _mm256_sub_ps(iz1,jz1);
1867             dx12             = _mm256_sub_ps(ix1,jx2);
1868             dy12             = _mm256_sub_ps(iy1,jy2);
1869             dz12             = _mm256_sub_ps(iz1,jz2);
1870             dx20             = _mm256_sub_ps(ix2,jx0);
1871             dy20             = _mm256_sub_ps(iy2,jy0);
1872             dz20             = _mm256_sub_ps(iz2,jz0);
1873             dx21             = _mm256_sub_ps(ix2,jx1);
1874             dy21             = _mm256_sub_ps(iy2,jy1);
1875             dz21             = _mm256_sub_ps(iz2,jz1);
1876             dx22             = _mm256_sub_ps(ix2,jx2);
1877             dy22             = _mm256_sub_ps(iy2,jy2);
1878             dz22             = _mm256_sub_ps(iz2,jz2);
1879
1880             /* Calculate squared distance and things based on it */
1881             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1882             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1883             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1884             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1885             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1886             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1887             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1888             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1889             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1890
1891             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1892             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1893             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1894             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1895             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1896             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1897             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1898             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1899             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1900
1901             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1902             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1903             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1904             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1905             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1906             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1907             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1908             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1909             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1910
1911             fjx0             = _mm256_setzero_ps();
1912             fjy0             = _mm256_setzero_ps();
1913             fjz0             = _mm256_setzero_ps();
1914             fjx1             = _mm256_setzero_ps();
1915             fjy1             = _mm256_setzero_ps();
1916             fjz1             = _mm256_setzero_ps();
1917             fjx2             = _mm256_setzero_ps();
1918             fjy2             = _mm256_setzero_ps();
1919             fjz2             = _mm256_setzero_ps();
1920
1921             /**************************
1922              * CALCULATE INTERACTIONS *
1923              **************************/
1924
1925             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1926             {
1927
1928             r00              = _mm256_mul_ps(rsq00,rinv00);
1929
1930             /* EWALD ELECTROSTATICS */
1931             
1932             /* Analytical PME correction */
1933             zeta2            = _mm256_mul_ps(beta2,rsq00);
1934             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1935             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1936             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1937             felec            = _mm256_mul_ps(qq00,felec);
1938             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1939             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1940             velec            = _mm256_sub_ps(rinv00,pmecorrV);
1941             velec            = _mm256_mul_ps(qq00,velec);
1942             
1943             /* LENNARD-JONES DISPERSION/REPULSION */
1944
1945             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1946             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
1947             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1948             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1949             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1950
1951             d                = _mm256_sub_ps(r00,rswitch);
1952             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1953             d2               = _mm256_mul_ps(d,d);
1954             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1955
1956             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1957
1958             /* Evaluate switch function */
1959             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1960             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1961             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1962             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1963
1964             fscal            = _mm256_add_ps(felec,fvdw);
1965
1966             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1967
1968             /* Calculate temporary vectorial force */
1969             tx               = _mm256_mul_ps(fscal,dx00);
1970             ty               = _mm256_mul_ps(fscal,dy00);
1971             tz               = _mm256_mul_ps(fscal,dz00);
1972
1973             /* Update vectorial force */
1974             fix0             = _mm256_add_ps(fix0,tx);
1975             fiy0             = _mm256_add_ps(fiy0,ty);
1976             fiz0             = _mm256_add_ps(fiz0,tz);
1977
1978             fjx0             = _mm256_add_ps(fjx0,tx);
1979             fjy0             = _mm256_add_ps(fjy0,ty);
1980             fjz0             = _mm256_add_ps(fjz0,tz);
1981
1982             }
1983
1984             /**************************
1985              * CALCULATE INTERACTIONS *
1986              **************************/
1987
1988             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1989             {
1990
1991             r01              = _mm256_mul_ps(rsq01,rinv01);
1992
1993             /* EWALD ELECTROSTATICS */
1994             
1995             /* Analytical PME correction */
1996             zeta2            = _mm256_mul_ps(beta2,rsq01);
1997             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1998             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1999             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2000             felec            = _mm256_mul_ps(qq01,felec);
2001             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2002             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2003             velec            = _mm256_sub_ps(rinv01,pmecorrV);
2004             velec            = _mm256_mul_ps(qq01,velec);
2005             
2006             d                = _mm256_sub_ps(r01,rswitch);
2007             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2008             d2               = _mm256_mul_ps(d,d);
2009             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2010
2011             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2012
2013             /* Evaluate switch function */
2014             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2015             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
2016             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2017
2018             fscal            = felec;
2019
2020             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2021
2022             /* Calculate temporary vectorial force */
2023             tx               = _mm256_mul_ps(fscal,dx01);
2024             ty               = _mm256_mul_ps(fscal,dy01);
2025             tz               = _mm256_mul_ps(fscal,dz01);
2026
2027             /* Update vectorial force */
2028             fix0             = _mm256_add_ps(fix0,tx);
2029             fiy0             = _mm256_add_ps(fiy0,ty);
2030             fiz0             = _mm256_add_ps(fiz0,tz);
2031
2032             fjx1             = _mm256_add_ps(fjx1,tx);
2033             fjy1             = _mm256_add_ps(fjy1,ty);
2034             fjz1             = _mm256_add_ps(fjz1,tz);
2035
2036             }
2037
2038             /**************************
2039              * CALCULATE INTERACTIONS *
2040              **************************/
2041
2042             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2043             {
2044
2045             r02              = _mm256_mul_ps(rsq02,rinv02);
2046
2047             /* EWALD ELECTROSTATICS */
2048             
2049             /* Analytical PME correction */
2050             zeta2            = _mm256_mul_ps(beta2,rsq02);
2051             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
2052             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2053             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2054             felec            = _mm256_mul_ps(qq02,felec);
2055             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2056             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2057             velec            = _mm256_sub_ps(rinv02,pmecorrV);
2058             velec            = _mm256_mul_ps(qq02,velec);
2059             
2060             d                = _mm256_sub_ps(r02,rswitch);
2061             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2062             d2               = _mm256_mul_ps(d,d);
2063             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2064
2065             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2066
2067             /* Evaluate switch function */
2068             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2069             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
2070             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2071
2072             fscal            = felec;
2073
2074             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2075
2076             /* Calculate temporary vectorial force */
2077             tx               = _mm256_mul_ps(fscal,dx02);
2078             ty               = _mm256_mul_ps(fscal,dy02);
2079             tz               = _mm256_mul_ps(fscal,dz02);
2080
2081             /* Update vectorial force */
2082             fix0             = _mm256_add_ps(fix0,tx);
2083             fiy0             = _mm256_add_ps(fiy0,ty);
2084             fiz0             = _mm256_add_ps(fiz0,tz);
2085
2086             fjx2             = _mm256_add_ps(fjx2,tx);
2087             fjy2             = _mm256_add_ps(fjy2,ty);
2088             fjz2             = _mm256_add_ps(fjz2,tz);
2089
2090             }
2091
2092             /**************************
2093              * CALCULATE INTERACTIONS *
2094              **************************/
2095
2096             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2097             {
2098
2099             r10              = _mm256_mul_ps(rsq10,rinv10);
2100
2101             /* EWALD ELECTROSTATICS */
2102             
2103             /* Analytical PME correction */
2104             zeta2            = _mm256_mul_ps(beta2,rsq10);
2105             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
2106             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2107             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2108             felec            = _mm256_mul_ps(qq10,felec);
2109             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2110             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2111             velec            = _mm256_sub_ps(rinv10,pmecorrV);
2112             velec            = _mm256_mul_ps(qq10,velec);
2113             
2114             d                = _mm256_sub_ps(r10,rswitch);
2115             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2116             d2               = _mm256_mul_ps(d,d);
2117             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2118
2119             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2120
2121             /* Evaluate switch function */
2122             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2123             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
2124             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2125
2126             fscal            = felec;
2127
2128             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2129
2130             /* Calculate temporary vectorial force */
2131             tx               = _mm256_mul_ps(fscal,dx10);
2132             ty               = _mm256_mul_ps(fscal,dy10);
2133             tz               = _mm256_mul_ps(fscal,dz10);
2134
2135             /* Update vectorial force */
2136             fix1             = _mm256_add_ps(fix1,tx);
2137             fiy1             = _mm256_add_ps(fiy1,ty);
2138             fiz1             = _mm256_add_ps(fiz1,tz);
2139
2140             fjx0             = _mm256_add_ps(fjx0,tx);
2141             fjy0             = _mm256_add_ps(fjy0,ty);
2142             fjz0             = _mm256_add_ps(fjz0,tz);
2143
2144             }
2145
2146             /**************************
2147              * CALCULATE INTERACTIONS *
2148              **************************/
2149
2150             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2151             {
2152
2153             r11              = _mm256_mul_ps(rsq11,rinv11);
2154
2155             /* EWALD ELECTROSTATICS */
2156             
2157             /* Analytical PME correction */
2158             zeta2            = _mm256_mul_ps(beta2,rsq11);
2159             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2160             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2161             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2162             felec            = _mm256_mul_ps(qq11,felec);
2163             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2164             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2165             velec            = _mm256_sub_ps(rinv11,pmecorrV);
2166             velec            = _mm256_mul_ps(qq11,velec);
2167             
2168             d                = _mm256_sub_ps(r11,rswitch);
2169             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2170             d2               = _mm256_mul_ps(d,d);
2171             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2172
2173             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2174
2175             /* Evaluate switch function */
2176             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2177             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2178             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2179
2180             fscal            = felec;
2181
2182             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2183
2184             /* Calculate temporary vectorial force */
2185             tx               = _mm256_mul_ps(fscal,dx11);
2186             ty               = _mm256_mul_ps(fscal,dy11);
2187             tz               = _mm256_mul_ps(fscal,dz11);
2188
2189             /* Update vectorial force */
2190             fix1             = _mm256_add_ps(fix1,tx);
2191             fiy1             = _mm256_add_ps(fiy1,ty);
2192             fiz1             = _mm256_add_ps(fiz1,tz);
2193
2194             fjx1             = _mm256_add_ps(fjx1,tx);
2195             fjy1             = _mm256_add_ps(fjy1,ty);
2196             fjz1             = _mm256_add_ps(fjz1,tz);
2197
2198             }
2199
2200             /**************************
2201              * CALCULATE INTERACTIONS *
2202              **************************/
2203
2204             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2205             {
2206
2207             r12              = _mm256_mul_ps(rsq12,rinv12);
2208
2209             /* EWALD ELECTROSTATICS */
2210             
2211             /* Analytical PME correction */
2212             zeta2            = _mm256_mul_ps(beta2,rsq12);
2213             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2214             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2215             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2216             felec            = _mm256_mul_ps(qq12,felec);
2217             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2218             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2219             velec            = _mm256_sub_ps(rinv12,pmecorrV);
2220             velec            = _mm256_mul_ps(qq12,velec);
2221             
2222             d                = _mm256_sub_ps(r12,rswitch);
2223             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2224             d2               = _mm256_mul_ps(d,d);
2225             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2226
2227             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2228
2229             /* Evaluate switch function */
2230             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2231             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2232             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2233
2234             fscal            = felec;
2235
2236             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2237
2238             /* Calculate temporary vectorial force */
2239             tx               = _mm256_mul_ps(fscal,dx12);
2240             ty               = _mm256_mul_ps(fscal,dy12);
2241             tz               = _mm256_mul_ps(fscal,dz12);
2242
2243             /* Update vectorial force */
2244             fix1             = _mm256_add_ps(fix1,tx);
2245             fiy1             = _mm256_add_ps(fiy1,ty);
2246             fiz1             = _mm256_add_ps(fiz1,tz);
2247
2248             fjx2             = _mm256_add_ps(fjx2,tx);
2249             fjy2             = _mm256_add_ps(fjy2,ty);
2250             fjz2             = _mm256_add_ps(fjz2,tz);
2251
2252             }
2253
2254             /**************************
2255              * CALCULATE INTERACTIONS *
2256              **************************/
2257
2258             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2259             {
2260
2261             r20              = _mm256_mul_ps(rsq20,rinv20);
2262
2263             /* EWALD ELECTROSTATICS */
2264             
2265             /* Analytical PME correction */
2266             zeta2            = _mm256_mul_ps(beta2,rsq20);
2267             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
2268             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2269             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2270             felec            = _mm256_mul_ps(qq20,felec);
2271             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2272             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2273             velec            = _mm256_sub_ps(rinv20,pmecorrV);
2274             velec            = _mm256_mul_ps(qq20,velec);
2275             
2276             d                = _mm256_sub_ps(r20,rswitch);
2277             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2278             d2               = _mm256_mul_ps(d,d);
2279             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2280
2281             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2282
2283             /* Evaluate switch function */
2284             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2285             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
2286             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2287
2288             fscal            = felec;
2289
2290             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2291
2292             /* Calculate temporary vectorial force */
2293             tx               = _mm256_mul_ps(fscal,dx20);
2294             ty               = _mm256_mul_ps(fscal,dy20);
2295             tz               = _mm256_mul_ps(fscal,dz20);
2296
2297             /* Update vectorial force */
2298             fix2             = _mm256_add_ps(fix2,tx);
2299             fiy2             = _mm256_add_ps(fiy2,ty);
2300             fiz2             = _mm256_add_ps(fiz2,tz);
2301
2302             fjx0             = _mm256_add_ps(fjx0,tx);
2303             fjy0             = _mm256_add_ps(fjy0,ty);
2304             fjz0             = _mm256_add_ps(fjz0,tz);
2305
2306             }
2307
2308             /**************************
2309              * CALCULATE INTERACTIONS *
2310              **************************/
2311
2312             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2313             {
2314
2315             r21              = _mm256_mul_ps(rsq21,rinv21);
2316
2317             /* EWALD ELECTROSTATICS */
2318             
2319             /* Analytical PME correction */
2320             zeta2            = _mm256_mul_ps(beta2,rsq21);
2321             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2322             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2323             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2324             felec            = _mm256_mul_ps(qq21,felec);
2325             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2326             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2327             velec            = _mm256_sub_ps(rinv21,pmecorrV);
2328             velec            = _mm256_mul_ps(qq21,velec);
2329             
2330             d                = _mm256_sub_ps(r21,rswitch);
2331             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2332             d2               = _mm256_mul_ps(d,d);
2333             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2334
2335             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2336
2337             /* Evaluate switch function */
2338             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2339             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2340             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2341
2342             fscal            = felec;
2343
2344             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2345
2346             /* Calculate temporary vectorial force */
2347             tx               = _mm256_mul_ps(fscal,dx21);
2348             ty               = _mm256_mul_ps(fscal,dy21);
2349             tz               = _mm256_mul_ps(fscal,dz21);
2350
2351             /* Update vectorial force */
2352             fix2             = _mm256_add_ps(fix2,tx);
2353             fiy2             = _mm256_add_ps(fiy2,ty);
2354             fiz2             = _mm256_add_ps(fiz2,tz);
2355
2356             fjx1             = _mm256_add_ps(fjx1,tx);
2357             fjy1             = _mm256_add_ps(fjy1,ty);
2358             fjz1             = _mm256_add_ps(fjz1,tz);
2359
2360             }
2361
2362             /**************************
2363              * CALCULATE INTERACTIONS *
2364              **************************/
2365
2366             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2367             {
2368
2369             r22              = _mm256_mul_ps(rsq22,rinv22);
2370
2371             /* EWALD ELECTROSTATICS */
2372             
2373             /* Analytical PME correction */
2374             zeta2            = _mm256_mul_ps(beta2,rsq22);
2375             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2376             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2377             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2378             felec            = _mm256_mul_ps(qq22,felec);
2379             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2380             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2381             velec            = _mm256_sub_ps(rinv22,pmecorrV);
2382             velec            = _mm256_mul_ps(qq22,velec);
2383             
2384             d                = _mm256_sub_ps(r22,rswitch);
2385             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2386             d2               = _mm256_mul_ps(d,d);
2387             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2388
2389             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2390
2391             /* Evaluate switch function */
2392             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2393             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2394             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2395
2396             fscal            = felec;
2397
2398             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2399
2400             /* Calculate temporary vectorial force */
2401             tx               = _mm256_mul_ps(fscal,dx22);
2402             ty               = _mm256_mul_ps(fscal,dy22);
2403             tz               = _mm256_mul_ps(fscal,dz22);
2404
2405             /* Update vectorial force */
2406             fix2             = _mm256_add_ps(fix2,tx);
2407             fiy2             = _mm256_add_ps(fiy2,ty);
2408             fiz2             = _mm256_add_ps(fiz2,tz);
2409
2410             fjx2             = _mm256_add_ps(fjx2,tx);
2411             fjy2             = _mm256_add_ps(fjy2,ty);
2412             fjz2             = _mm256_add_ps(fjz2,tz);
2413
2414             }
2415
2416             fjptrA             = f+j_coord_offsetA;
2417             fjptrB             = f+j_coord_offsetB;
2418             fjptrC             = f+j_coord_offsetC;
2419             fjptrD             = f+j_coord_offsetD;
2420             fjptrE             = f+j_coord_offsetE;
2421             fjptrF             = f+j_coord_offsetF;
2422             fjptrG             = f+j_coord_offsetG;
2423             fjptrH             = f+j_coord_offsetH;
2424
2425             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2426                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2427
2428             /* Inner loop uses 960 flops */
2429         }
2430
2431         if(jidx<j_index_end)
2432         {
2433
2434             /* Get j neighbor index, and coordinate index */
2435             jnrlistA         = jjnr[jidx];
2436             jnrlistB         = jjnr[jidx+1];
2437             jnrlistC         = jjnr[jidx+2];
2438             jnrlistD         = jjnr[jidx+3];
2439             jnrlistE         = jjnr[jidx+4];
2440             jnrlistF         = jjnr[jidx+5];
2441             jnrlistG         = jjnr[jidx+6];
2442             jnrlistH         = jjnr[jidx+7];
2443             /* Sign of each element will be negative for non-real atoms.
2444              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2445              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2446              */
2447             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2448                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2449                                             
2450             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2451             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2452             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2453             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2454             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2455             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2456             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2457             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2458             j_coord_offsetA  = DIM*jnrA;
2459             j_coord_offsetB  = DIM*jnrB;
2460             j_coord_offsetC  = DIM*jnrC;
2461             j_coord_offsetD  = DIM*jnrD;
2462             j_coord_offsetE  = DIM*jnrE;
2463             j_coord_offsetF  = DIM*jnrF;
2464             j_coord_offsetG  = DIM*jnrG;
2465             j_coord_offsetH  = DIM*jnrH;
2466
2467             /* load j atom coordinates */
2468             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2469                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2470                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2471                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2472                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2473
2474             /* Calculate displacement vector */
2475             dx00             = _mm256_sub_ps(ix0,jx0);
2476             dy00             = _mm256_sub_ps(iy0,jy0);
2477             dz00             = _mm256_sub_ps(iz0,jz0);
2478             dx01             = _mm256_sub_ps(ix0,jx1);
2479             dy01             = _mm256_sub_ps(iy0,jy1);
2480             dz01             = _mm256_sub_ps(iz0,jz1);
2481             dx02             = _mm256_sub_ps(ix0,jx2);
2482             dy02             = _mm256_sub_ps(iy0,jy2);
2483             dz02             = _mm256_sub_ps(iz0,jz2);
2484             dx10             = _mm256_sub_ps(ix1,jx0);
2485             dy10             = _mm256_sub_ps(iy1,jy0);
2486             dz10             = _mm256_sub_ps(iz1,jz0);
2487             dx11             = _mm256_sub_ps(ix1,jx1);
2488             dy11             = _mm256_sub_ps(iy1,jy1);
2489             dz11             = _mm256_sub_ps(iz1,jz1);
2490             dx12             = _mm256_sub_ps(ix1,jx2);
2491             dy12             = _mm256_sub_ps(iy1,jy2);
2492             dz12             = _mm256_sub_ps(iz1,jz2);
2493             dx20             = _mm256_sub_ps(ix2,jx0);
2494             dy20             = _mm256_sub_ps(iy2,jy0);
2495             dz20             = _mm256_sub_ps(iz2,jz0);
2496             dx21             = _mm256_sub_ps(ix2,jx1);
2497             dy21             = _mm256_sub_ps(iy2,jy1);
2498             dz21             = _mm256_sub_ps(iz2,jz1);
2499             dx22             = _mm256_sub_ps(ix2,jx2);
2500             dy22             = _mm256_sub_ps(iy2,jy2);
2501             dz22             = _mm256_sub_ps(iz2,jz2);
2502
2503             /* Calculate squared distance and things based on it */
2504             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2505             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2506             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2507             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2508             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2509             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2510             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2511             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2512             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2513
2514             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
2515             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
2516             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
2517             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
2518             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2519             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2520             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
2521             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2522             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2523
2524             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
2525             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
2526             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
2527             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
2528             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
2529             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
2530             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
2531             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
2532             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
2533
2534             fjx0             = _mm256_setzero_ps();
2535             fjy0             = _mm256_setzero_ps();
2536             fjz0             = _mm256_setzero_ps();
2537             fjx1             = _mm256_setzero_ps();
2538             fjy1             = _mm256_setzero_ps();
2539             fjz1             = _mm256_setzero_ps();
2540             fjx2             = _mm256_setzero_ps();
2541             fjy2             = _mm256_setzero_ps();
2542             fjz2             = _mm256_setzero_ps();
2543
2544             /**************************
2545              * CALCULATE INTERACTIONS *
2546              **************************/
2547
2548             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2549             {
2550
2551             r00              = _mm256_mul_ps(rsq00,rinv00);
2552             r00              = _mm256_andnot_ps(dummy_mask,r00);
2553
2554             /* EWALD ELECTROSTATICS */
2555             
2556             /* Analytical PME correction */
2557             zeta2            = _mm256_mul_ps(beta2,rsq00);
2558             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
2559             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2560             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2561             felec            = _mm256_mul_ps(qq00,felec);
2562             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2563             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2564             velec            = _mm256_sub_ps(rinv00,pmecorrV);
2565             velec            = _mm256_mul_ps(qq00,velec);
2566             
2567             /* LENNARD-JONES DISPERSION/REPULSION */
2568
2569             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2570             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
2571             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
2572             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
2573             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
2574
2575             d                = _mm256_sub_ps(r00,rswitch);
2576             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2577             d2               = _mm256_mul_ps(d,d);
2578             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2579
2580             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2581
2582             /* Evaluate switch function */
2583             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2584             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
2585             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
2586             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2587
2588             fscal            = _mm256_add_ps(felec,fvdw);
2589
2590             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2591
2592             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2593
2594             /* Calculate temporary vectorial force */
2595             tx               = _mm256_mul_ps(fscal,dx00);
2596             ty               = _mm256_mul_ps(fscal,dy00);
2597             tz               = _mm256_mul_ps(fscal,dz00);
2598
2599             /* Update vectorial force */
2600             fix0             = _mm256_add_ps(fix0,tx);
2601             fiy0             = _mm256_add_ps(fiy0,ty);
2602             fiz0             = _mm256_add_ps(fiz0,tz);
2603
2604             fjx0             = _mm256_add_ps(fjx0,tx);
2605             fjy0             = _mm256_add_ps(fjy0,ty);
2606             fjz0             = _mm256_add_ps(fjz0,tz);
2607
2608             }
2609
2610             /**************************
2611              * CALCULATE INTERACTIONS *
2612              **************************/
2613
2614             if (gmx_mm256_any_lt(rsq01,rcutoff2))
2615             {
2616
2617             r01              = _mm256_mul_ps(rsq01,rinv01);
2618             r01              = _mm256_andnot_ps(dummy_mask,r01);
2619
2620             /* EWALD ELECTROSTATICS */
2621             
2622             /* Analytical PME correction */
2623             zeta2            = _mm256_mul_ps(beta2,rsq01);
2624             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
2625             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2626             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2627             felec            = _mm256_mul_ps(qq01,felec);
2628             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2629             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2630             velec            = _mm256_sub_ps(rinv01,pmecorrV);
2631             velec            = _mm256_mul_ps(qq01,velec);
2632             
2633             d                = _mm256_sub_ps(r01,rswitch);
2634             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2635             d2               = _mm256_mul_ps(d,d);
2636             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2637
2638             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2639
2640             /* Evaluate switch function */
2641             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2642             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
2643             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2644
2645             fscal            = felec;
2646
2647             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2648
2649             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2650
2651             /* Calculate temporary vectorial force */
2652             tx               = _mm256_mul_ps(fscal,dx01);
2653             ty               = _mm256_mul_ps(fscal,dy01);
2654             tz               = _mm256_mul_ps(fscal,dz01);
2655
2656             /* Update vectorial force */
2657             fix0             = _mm256_add_ps(fix0,tx);
2658             fiy0             = _mm256_add_ps(fiy0,ty);
2659             fiz0             = _mm256_add_ps(fiz0,tz);
2660
2661             fjx1             = _mm256_add_ps(fjx1,tx);
2662             fjy1             = _mm256_add_ps(fjy1,ty);
2663             fjz1             = _mm256_add_ps(fjz1,tz);
2664
2665             }
2666
2667             /**************************
2668              * CALCULATE INTERACTIONS *
2669              **************************/
2670
2671             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2672             {
2673
2674             r02              = _mm256_mul_ps(rsq02,rinv02);
2675             r02              = _mm256_andnot_ps(dummy_mask,r02);
2676
2677             /* EWALD ELECTROSTATICS */
2678             
2679             /* Analytical PME correction */
2680             zeta2            = _mm256_mul_ps(beta2,rsq02);
2681             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
2682             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2683             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2684             felec            = _mm256_mul_ps(qq02,felec);
2685             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2686             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2687             velec            = _mm256_sub_ps(rinv02,pmecorrV);
2688             velec            = _mm256_mul_ps(qq02,velec);
2689             
2690             d                = _mm256_sub_ps(r02,rswitch);
2691             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2692             d2               = _mm256_mul_ps(d,d);
2693             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2694
2695             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2696
2697             /* Evaluate switch function */
2698             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2699             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
2700             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2701
2702             fscal            = felec;
2703
2704             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2705
2706             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2707
2708             /* Calculate temporary vectorial force */
2709             tx               = _mm256_mul_ps(fscal,dx02);
2710             ty               = _mm256_mul_ps(fscal,dy02);
2711             tz               = _mm256_mul_ps(fscal,dz02);
2712
2713             /* Update vectorial force */
2714             fix0             = _mm256_add_ps(fix0,tx);
2715             fiy0             = _mm256_add_ps(fiy0,ty);
2716             fiz0             = _mm256_add_ps(fiz0,tz);
2717
2718             fjx2             = _mm256_add_ps(fjx2,tx);
2719             fjy2             = _mm256_add_ps(fjy2,ty);
2720             fjz2             = _mm256_add_ps(fjz2,tz);
2721
2722             }
2723
2724             /**************************
2725              * CALCULATE INTERACTIONS *
2726              **************************/
2727
2728             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2729             {
2730
2731             r10              = _mm256_mul_ps(rsq10,rinv10);
2732             r10              = _mm256_andnot_ps(dummy_mask,r10);
2733
2734             /* EWALD ELECTROSTATICS */
2735             
2736             /* Analytical PME correction */
2737             zeta2            = _mm256_mul_ps(beta2,rsq10);
2738             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
2739             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2740             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2741             felec            = _mm256_mul_ps(qq10,felec);
2742             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2743             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2744             velec            = _mm256_sub_ps(rinv10,pmecorrV);
2745             velec            = _mm256_mul_ps(qq10,velec);
2746             
2747             d                = _mm256_sub_ps(r10,rswitch);
2748             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2749             d2               = _mm256_mul_ps(d,d);
2750             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2751
2752             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2753
2754             /* Evaluate switch function */
2755             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2756             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
2757             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2758
2759             fscal            = felec;
2760
2761             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2762
2763             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2764
2765             /* Calculate temporary vectorial force */
2766             tx               = _mm256_mul_ps(fscal,dx10);
2767             ty               = _mm256_mul_ps(fscal,dy10);
2768             tz               = _mm256_mul_ps(fscal,dz10);
2769
2770             /* Update vectorial force */
2771             fix1             = _mm256_add_ps(fix1,tx);
2772             fiy1             = _mm256_add_ps(fiy1,ty);
2773             fiz1             = _mm256_add_ps(fiz1,tz);
2774
2775             fjx0             = _mm256_add_ps(fjx0,tx);
2776             fjy0             = _mm256_add_ps(fjy0,ty);
2777             fjz0             = _mm256_add_ps(fjz0,tz);
2778
2779             }
2780
2781             /**************************
2782              * CALCULATE INTERACTIONS *
2783              **************************/
2784
2785             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2786             {
2787
2788             r11              = _mm256_mul_ps(rsq11,rinv11);
2789             r11              = _mm256_andnot_ps(dummy_mask,r11);
2790
2791             /* EWALD ELECTROSTATICS */
2792             
2793             /* Analytical PME correction */
2794             zeta2            = _mm256_mul_ps(beta2,rsq11);
2795             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2796             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2797             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2798             felec            = _mm256_mul_ps(qq11,felec);
2799             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2800             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2801             velec            = _mm256_sub_ps(rinv11,pmecorrV);
2802             velec            = _mm256_mul_ps(qq11,velec);
2803             
2804             d                = _mm256_sub_ps(r11,rswitch);
2805             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2806             d2               = _mm256_mul_ps(d,d);
2807             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2808
2809             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2810
2811             /* Evaluate switch function */
2812             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2813             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2814             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2815
2816             fscal            = felec;
2817
2818             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2819
2820             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2821
2822             /* Calculate temporary vectorial force */
2823             tx               = _mm256_mul_ps(fscal,dx11);
2824             ty               = _mm256_mul_ps(fscal,dy11);
2825             tz               = _mm256_mul_ps(fscal,dz11);
2826
2827             /* Update vectorial force */
2828             fix1             = _mm256_add_ps(fix1,tx);
2829             fiy1             = _mm256_add_ps(fiy1,ty);
2830             fiz1             = _mm256_add_ps(fiz1,tz);
2831
2832             fjx1             = _mm256_add_ps(fjx1,tx);
2833             fjy1             = _mm256_add_ps(fjy1,ty);
2834             fjz1             = _mm256_add_ps(fjz1,tz);
2835
2836             }
2837
2838             /**************************
2839              * CALCULATE INTERACTIONS *
2840              **************************/
2841
2842             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2843             {
2844
2845             r12              = _mm256_mul_ps(rsq12,rinv12);
2846             r12              = _mm256_andnot_ps(dummy_mask,r12);
2847
2848             /* EWALD ELECTROSTATICS */
2849             
2850             /* Analytical PME correction */
2851             zeta2            = _mm256_mul_ps(beta2,rsq12);
2852             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2853             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2854             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2855             felec            = _mm256_mul_ps(qq12,felec);
2856             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2857             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2858             velec            = _mm256_sub_ps(rinv12,pmecorrV);
2859             velec            = _mm256_mul_ps(qq12,velec);
2860             
2861             d                = _mm256_sub_ps(r12,rswitch);
2862             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2863             d2               = _mm256_mul_ps(d,d);
2864             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2865
2866             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2867
2868             /* Evaluate switch function */
2869             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2870             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2871             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2872
2873             fscal            = felec;
2874
2875             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2876
2877             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2878
2879             /* Calculate temporary vectorial force */
2880             tx               = _mm256_mul_ps(fscal,dx12);
2881             ty               = _mm256_mul_ps(fscal,dy12);
2882             tz               = _mm256_mul_ps(fscal,dz12);
2883
2884             /* Update vectorial force */
2885             fix1             = _mm256_add_ps(fix1,tx);
2886             fiy1             = _mm256_add_ps(fiy1,ty);
2887             fiz1             = _mm256_add_ps(fiz1,tz);
2888
2889             fjx2             = _mm256_add_ps(fjx2,tx);
2890             fjy2             = _mm256_add_ps(fjy2,ty);
2891             fjz2             = _mm256_add_ps(fjz2,tz);
2892
2893             }
2894
2895             /**************************
2896              * CALCULATE INTERACTIONS *
2897              **************************/
2898
2899             if (gmx_mm256_any_lt(rsq20,rcutoff2))
2900             {
2901
2902             r20              = _mm256_mul_ps(rsq20,rinv20);
2903             r20              = _mm256_andnot_ps(dummy_mask,r20);
2904
2905             /* EWALD ELECTROSTATICS */
2906             
2907             /* Analytical PME correction */
2908             zeta2            = _mm256_mul_ps(beta2,rsq20);
2909             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
2910             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2911             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2912             felec            = _mm256_mul_ps(qq20,felec);
2913             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2914             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2915             velec            = _mm256_sub_ps(rinv20,pmecorrV);
2916             velec            = _mm256_mul_ps(qq20,velec);
2917             
2918             d                = _mm256_sub_ps(r20,rswitch);
2919             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2920             d2               = _mm256_mul_ps(d,d);
2921             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2922
2923             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2924
2925             /* Evaluate switch function */
2926             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2927             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
2928             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2929
2930             fscal            = felec;
2931
2932             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2933
2934             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2935
2936             /* Calculate temporary vectorial force */
2937             tx               = _mm256_mul_ps(fscal,dx20);
2938             ty               = _mm256_mul_ps(fscal,dy20);
2939             tz               = _mm256_mul_ps(fscal,dz20);
2940
2941             /* Update vectorial force */
2942             fix2             = _mm256_add_ps(fix2,tx);
2943             fiy2             = _mm256_add_ps(fiy2,ty);
2944             fiz2             = _mm256_add_ps(fiz2,tz);
2945
2946             fjx0             = _mm256_add_ps(fjx0,tx);
2947             fjy0             = _mm256_add_ps(fjy0,ty);
2948             fjz0             = _mm256_add_ps(fjz0,tz);
2949
2950             }
2951
2952             /**************************
2953              * CALCULATE INTERACTIONS *
2954              **************************/
2955
2956             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2957             {
2958
2959             r21              = _mm256_mul_ps(rsq21,rinv21);
2960             r21              = _mm256_andnot_ps(dummy_mask,r21);
2961
2962             /* EWALD ELECTROSTATICS */
2963             
2964             /* Analytical PME correction */
2965             zeta2            = _mm256_mul_ps(beta2,rsq21);
2966             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2967             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2968             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2969             felec            = _mm256_mul_ps(qq21,felec);
2970             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2971             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2972             velec            = _mm256_sub_ps(rinv21,pmecorrV);
2973             velec            = _mm256_mul_ps(qq21,velec);
2974             
2975             d                = _mm256_sub_ps(r21,rswitch);
2976             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2977             d2               = _mm256_mul_ps(d,d);
2978             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2979
2980             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2981
2982             /* Evaluate switch function */
2983             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2984             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2985             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2986
2987             fscal            = felec;
2988
2989             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2990
2991             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2992
2993             /* Calculate temporary vectorial force */
2994             tx               = _mm256_mul_ps(fscal,dx21);
2995             ty               = _mm256_mul_ps(fscal,dy21);
2996             tz               = _mm256_mul_ps(fscal,dz21);
2997
2998             /* Update vectorial force */
2999             fix2             = _mm256_add_ps(fix2,tx);
3000             fiy2             = _mm256_add_ps(fiy2,ty);
3001             fiz2             = _mm256_add_ps(fiz2,tz);
3002
3003             fjx1             = _mm256_add_ps(fjx1,tx);
3004             fjy1             = _mm256_add_ps(fjy1,ty);
3005             fjz1             = _mm256_add_ps(fjz1,tz);
3006
3007             }
3008
3009             /**************************
3010              * CALCULATE INTERACTIONS *
3011              **************************/
3012
3013             if (gmx_mm256_any_lt(rsq22,rcutoff2))
3014             {
3015
3016             r22              = _mm256_mul_ps(rsq22,rinv22);
3017             r22              = _mm256_andnot_ps(dummy_mask,r22);
3018
3019             /* EWALD ELECTROSTATICS */
3020             
3021             /* Analytical PME correction */
3022             zeta2            = _mm256_mul_ps(beta2,rsq22);
3023             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
3024             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
3025             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
3026             felec            = _mm256_mul_ps(qq22,felec);
3027             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
3028             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
3029             velec            = _mm256_sub_ps(rinv22,pmecorrV);
3030             velec            = _mm256_mul_ps(qq22,velec);
3031             
3032             d                = _mm256_sub_ps(r22,rswitch);
3033             d                = _mm256_max_ps(d,_mm256_setzero_ps());
3034             d2               = _mm256_mul_ps(d,d);
3035             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
3036
3037             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
3038
3039             /* Evaluate switch function */
3040             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
3041             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
3042             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
3043
3044             fscal            = felec;
3045
3046             fscal            = _mm256_and_ps(fscal,cutoff_mask);
3047
3048             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
3049
3050             /* Calculate temporary vectorial force */
3051             tx               = _mm256_mul_ps(fscal,dx22);
3052             ty               = _mm256_mul_ps(fscal,dy22);
3053             tz               = _mm256_mul_ps(fscal,dz22);
3054
3055             /* Update vectorial force */
3056             fix2             = _mm256_add_ps(fix2,tx);
3057             fiy2             = _mm256_add_ps(fiy2,ty);
3058             fiz2             = _mm256_add_ps(fiz2,tz);
3059
3060             fjx2             = _mm256_add_ps(fjx2,tx);
3061             fjy2             = _mm256_add_ps(fjy2,ty);
3062             fjz2             = _mm256_add_ps(fjz2,tz);
3063
3064             }
3065
3066             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3067             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3068             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3069             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3070             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3071             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3072             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3073             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
3074
3075             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
3076                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
3077
3078             /* Inner loop uses 969 flops */
3079         }
3080
3081         /* End of innermost loop */
3082
3083         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
3084                                                  f+i_coord_offset,fshift+i_shift_offset);
3085
3086         /* Increment number of inner iterations */
3087         inneriter                  += j_index_end - j_index_start;
3088
3089         /* Outer loop uses 18 flops */
3090     }
3091
3092     /* Increment number of outer iterations */
3093     outeriter        += nri;
3094
3095     /* Update outer/inner flops */
3096
3097     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*969);
3098 }