/*
 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
 *
 *                This source code is part of
 *
 *                 G   R   O   M   A   C   S
 *
 * Copyright (c) 2001-2012, The GROMACS Development Team
 *
 * Gromacs is a library for molecular simulation and trajectory analysis,
 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
 * a full list of developers and information, check out http://www.gromacs.org
 *
 * This program is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option) any
 * later version.
 *
 * To help fund GROMACS development, we humbly ask that you cite
 * the papers people have written on it - you can find them on the website.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>

#include "../nb_kernel.h"
#include "types/simple.h"
#include "vec.h"
#include "nrnb.h"

#include "gmx_math_x86_avx_256_single.h"
#include "kernelutil_x86_avx_256_single.h"
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_single
 * Electrostatics interaction: Ewald
 * VdW interaction:            LennardJones
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        PotentialAndForce
 */
void
nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_VF_avx_256_single
                    (t_nblist * gmx_restrict                nlist,
                     rvec * gmx_restrict                    xx,
                     rvec * gmx_restrict                    ff,
                     t_forcerec * gmx_restrict              fr,
                     t_mdatoms * gmx_restrict               mdatoms,
                     nb_kernel_data_t * gmx_restrict        kernel_data,
                     t_nrnb * gmx_restrict                  nrnb)
{
    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
     * just 0 for non-waters.
     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. the eight different
     * jnr indices corresponding to data put in the eight positions in the SIMD register.
     */
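    /* Illustrative sketch of the layout (not generated code): after an 8-way
     * load/swizzle, one SIMD register holds a single component for all eight
     * j waters, e.g.
     *   jx0 = { x0A, x0B, x0C, x0D, x0E, x0F, x0G, x0H }
     * where 0 is the water-atom index and A..H are the eight jnr entries.
     */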
    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
    int              jnrA,jnrB,jnrC,jnrD;
    int              jnrE,jnrF,jnrG,jnrH;
    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
    real             scratch[4*DIM];
    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    real *           vdwioffsetptr1;
    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    real *           vdwioffsetptr2;
    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
    __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    int              nvdwtype;
    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
    __m256i          ewitab;
    __m128i          ewitab_lo,ewitab_hi;
    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
    real             *ewtab;
    __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
    real             rswitch_scalar,d_scalar;
    __m256           dummy_mask,cutoff_mask;
    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    __m256           one     = _mm256_set1_ps(1.0);
    __m256           two     = _mm256_set1_ps(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_ps(fr->epsfac);
    charge           = mdatoms->chargeA;
    nvdwtype         = fr->ntype;
    vdwparam         = fr->nbfp;
    vdwtype          = mdatoms->typeA;

    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
    beta2            = _mm256_mul_ps(beta,beta);
    beta3            = _mm256_mul_ps(beta,beta2);

    ewtab            = fr->ic->tabq_coul_FDV0;
    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
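    /* Note (assumed layout, matching the tabq_coul_FDV0 naming): the Ewald table
     * interleaves F, D, V and a zero pad per point, and ewtabhalfspace is half a
     * table spacing, the midpoint offset used for quadratic interpolation. This
     * kernel variant takes the analytical PME-correction path below, so the
     * table fields are set up but not read in the interaction blocks shown here.
     */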

    /* Setup water-specific parameters */
    inr              = nlist->iinr[0];
    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
    vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];

    jq0              = _mm256_set1_ps(charge[inr+0]);
    jq1              = _mm256_set1_ps(charge[inr+1]);
    jq2              = _mm256_set1_ps(charge[inr+2]);
    vdwjidx0A        = 2*vdwtype[inr+0];
    qq00             = _mm256_mul_ps(iq0,jq0);
    c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
    c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
    qq01             = _mm256_mul_ps(iq0,jq1);
    qq02             = _mm256_mul_ps(iq0,jq2);
    qq10             = _mm256_mul_ps(iq1,jq0);
    qq11             = _mm256_mul_ps(iq1,jq1);
    qq12             = _mm256_mul_ps(iq1,jq2);
    qq20             = _mm256_mul_ps(iq2,jq0);
    qq21             = _mm256_mul_ps(iq2,jq1);
    qq22             = _mm256_mul_ps(iq2,jq2);
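    /* Worked example (hypothetical charges, not taken from this file): for a
     * 3-site water with q0 = -0.834e and q1 = q2 = +0.417e, facel scales each
     * product, so e.g. qq00 = facel*q0*q0 and qq01 = qq02 = facel*q0*q1. All
     * nine i-j charge products can be precomputed once because every list
     * entry in this kernel is a water-water pair with identical topology.
     */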

    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
    rcutoff_scalar   = fr->rcoulomb;
    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);

    rswitch_scalar   = fr->rcoulomb_switch;
    rswitch          = _mm256_set1_ps(rswitch_scalar);
    /* Setup switch parameters */
    d_scalar         = rcutoff_scalar-rswitch_scalar;
    d                = _mm256_set1_ps(d_scalar);
    swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
    swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
    swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
    swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
    swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
    swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
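    /* Sanity check of the switch polynomial (sketch): with d = r - rswitch and
     * D = rcutoff - rswitch, sw(d) = 1 + swV3*d^3 + swV4*d^4 + swV5*d^5. At the
     * cutoff (d = D) this gives 1 - 10 + 15 - 6 = 0, at d = 0 it gives 1, and
     * sw'(0) = sw''(0) = sw'(D) = sw''(D) = 0, so both potential and force are
     * taken smoothly to zero. The swF2..swF4 coefficients are exactly the
     * derivative terms: dsw/dd = d^2*(swF2 + swF3*d + swF4*d^2).
     */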

    /* Avoid stupid compiler warnings */
    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
    j_coord_offsetA = 0;
    j_coord_offsetB = 0;
    j_coord_offsetC = 0;
    j_coord_offsetD = 0;
    j_coord_offsetE = 0;
    j_coord_offsetF = 0;
    j_coord_offsetG = 0;
    j_coord_offsetH = 0;

    outeriter        = 0;
    inneriter        = 0;

    for(iidx=0;iidx<4*DIM;iidx++)
    {
        scratch[iidx] = 0.0;
    }

    /* Start outer loop over neighborlists */
    for(iidx=0; iidx<nri; iidx++)
    {
        /* Load shift vector for this list */
        i_shift_offset   = DIM*shiftidx[iidx];

        /* Load limits for loop over neighbors */
        j_index_start    = jindex[iidx];
        j_index_end      = jindex[iidx+1];

        /* Get outer coordinate index */
        inr              = iinr[iidx];
        i_coord_offset   = DIM*inr;

        /* Load i particle coords and add shift vector */
        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);

        fix0             = _mm256_setzero_ps();
        fiy0             = _mm256_setzero_ps();
        fiz0             = _mm256_setzero_ps();
        fix1             = _mm256_setzero_ps();
        fiy1             = _mm256_setzero_ps();
        fiz1             = _mm256_setzero_ps();
        fix2             = _mm256_setzero_ps();
        fiy2             = _mm256_setzero_ps();
        fiz2             = _mm256_setzero_ps();

        /* Reset potential sums */
        velecsum         = _mm256_setzero_ps();
        vvdwsum          = _mm256_setzero_ps();

        /* Start inner kernel loop */
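        /* The 8-wide loop below only runs while all eight jjnr entries are
         * real atoms; the list is padded with negative indices, and any
         * remainder is handled by the masked epilogue loop that follows.
         */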
        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
        {

            /* Get j neighbor index, and coordinate index */
            jnrA             = jjnr[jidx];
            jnrB             = jjnr[jidx+1];
            jnrC             = jjnr[jidx+2];
            jnrD             = jjnr[jidx+3];
            jnrE             = jjnr[jidx+4];
            jnrF             = jjnr[jidx+5];
            jnrG             = jjnr[jidx+6];
            jnrH             = jjnr[jidx+7];
            j_coord_offsetA  = DIM*jnrA;
            j_coord_offsetB  = DIM*jnrB;
            j_coord_offsetC  = DIM*jnrC;
            j_coord_offsetD  = DIM*jnrD;
            j_coord_offsetE  = DIM*jnrE;
            j_coord_offsetF  = DIM*jnrF;
            j_coord_offsetG  = DIM*jnrG;
            j_coord_offsetH  = DIM*jnrH;

            /* load j atom coordinates */
            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
                                                 x+j_coord_offsetC,x+j_coord_offsetD,
                                                 x+j_coord_offsetE,x+j_coord_offsetF,
                                                 x+j_coord_offsetG,x+j_coord_offsetH,
                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);

            /* Calculate displacement vector */
            dx00             = _mm256_sub_ps(ix0,jx0);
            dy00             = _mm256_sub_ps(iy0,jy0);
            dz00             = _mm256_sub_ps(iz0,jz0);
            dx01             = _mm256_sub_ps(ix0,jx1);
            dy01             = _mm256_sub_ps(iy0,jy1);
            dz01             = _mm256_sub_ps(iz0,jz1);
            dx02             = _mm256_sub_ps(ix0,jx2);
            dy02             = _mm256_sub_ps(iy0,jy2);
            dz02             = _mm256_sub_ps(iz0,jz2);
            dx10             = _mm256_sub_ps(ix1,jx0);
            dy10             = _mm256_sub_ps(iy1,jy0);
            dz10             = _mm256_sub_ps(iz1,jz0);
            dx11             = _mm256_sub_ps(ix1,jx1);
            dy11             = _mm256_sub_ps(iy1,jy1);
            dz11             = _mm256_sub_ps(iz1,jz1);
            dx12             = _mm256_sub_ps(ix1,jx2);
            dy12             = _mm256_sub_ps(iy1,jy2);
            dz12             = _mm256_sub_ps(iz1,jz2);
            dx20             = _mm256_sub_ps(ix2,jx0);
            dy20             = _mm256_sub_ps(iy2,jy0);
            dz20             = _mm256_sub_ps(iz2,jz0);
            dx21             = _mm256_sub_ps(ix2,jx1);
            dy21             = _mm256_sub_ps(iy2,jy1);
            dz21             = _mm256_sub_ps(iz2,jz1);
            dx22             = _mm256_sub_ps(ix2,jx2);
            dy22             = _mm256_sub_ps(iy2,jy2);
            dz22             = _mm256_sub_ps(iz2,jz2);

            /* Calculate squared distance and things based on it */
            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);

            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
            rinv22           = gmx_mm256_invsqrt_ps(rsq22);

            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);

            fjx0             = _mm256_setzero_ps();
            fjy0             = _mm256_setzero_ps();
            fjz0             = _mm256_setzero_ps();
            fjx1             = _mm256_setzero_ps();
            fjy1             = _mm256_setzero_ps();
            fjz1             = _mm256_setzero_ps();
            fjx2             = _mm256_setzero_ps();
            fjy2             = _mm256_setzero_ps();
            fjz2             = _mm256_setzero_ps();

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq00,rcutoff2))
            {

            r00              = _mm256_mul_ps(rsq00,rinv00);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq00);
            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq00,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv00,pmecorrV);
            velec            = _mm256_mul_ps(qq00,velec);
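            /* Sketch of the underlying math, inferred from the call structure:
             * the analytical correction builds the Ewald real-space pair terms
             * from erfc(beta*r)/r = 1/r - erf(beta*r)/r. With z = beta*r and
             * zeta2 = z^2, pmecorrV(zeta2) approximates erf(z)/z, so
             * beta*pmecorrV gives erf(beta*r)/r, and the force scalar is
             * assembled analogously from pmecorrF and 1/r^3.
             */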

            /* LENNARD-JONES DISPERSION/REPULSION */

            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
            vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
            vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
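            /* Assumed convention of this kernel family: the c6/c12 values taken
             * from fr->nbfp are premultiplied by 6 and 12 respectively, which is
             * why the potential carries the one_sixth/one_twelfth factors while
             * fvdw = (vvdw12 - vvdw6)*rinvsq00 directly yields
             * F/r = (12*C12*r^-12 - 6*C6*r^-6)/r^2.
             */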

            d                = _mm256_sub_ps(r00,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
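            /* In detail: with F(r) = -d(v*sw)/dr = -(dv/dr)*sw - v*(dsw/dr) and
             * fscal defined as F/r, the first term is the unswitched fscal
             * scaled by sw and the second is v*dsw/r, i.e. v*dsw*rinv, which is
             * exactly how felec and fvdw are combined below.
             */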
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
            fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            vvdw             = _mm256_mul_ps(vvdw,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);
            vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
            vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);

            fscal            = _mm256_add_ps(felec,fvdw);

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx00);
            ty               = _mm256_mul_ps(fscal,dy00);
            tz               = _mm256_mul_ps(fscal,dz00);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq01,rcutoff2))
            {

            r01              = _mm256_mul_ps(rsq01,rinv01);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq01);
            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq01,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv01,pmecorrV);
            velec            = _mm256_mul_ps(qq01,velec);

            d                = _mm256_sub_ps(r01,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx01);
            ty               = _mm256_mul_ps(fscal,dy01);
            tz               = _mm256_mul_ps(fscal,dz01);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq02,rcutoff2))
            {

            r02              = _mm256_mul_ps(rsq02,rinv02);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq02);
            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq02,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv02,pmecorrV);
            velec            = _mm256_mul_ps(qq02,velec);

            d                = _mm256_sub_ps(r02,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx02);
            ty               = _mm256_mul_ps(fscal,dy02);
            tz               = _mm256_mul_ps(fscal,dz02);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq10,rcutoff2))
            {

            r10              = _mm256_mul_ps(rsq10,rinv10);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq10);
            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq10,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv10,pmecorrV);
            velec            = _mm256_mul_ps(qq10,velec);

            d                = _mm256_sub_ps(r10,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx10);
            ty               = _mm256_mul_ps(fscal,dy10);
            tz               = _mm256_mul_ps(fscal,dz10);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq11,rcutoff2))
            {

            r11              = _mm256_mul_ps(rsq11,rinv11);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq11);
            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq11,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv11,pmecorrV);
            velec            = _mm256_mul_ps(qq11,velec);

            d                = _mm256_sub_ps(r11,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx11);
            ty               = _mm256_mul_ps(fscal,dy11);
            tz               = _mm256_mul_ps(fscal,dz11);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq12,rcutoff2))
            {

            r12              = _mm256_mul_ps(rsq12,rinv12);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq12);
            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq12,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv12,pmecorrV);
            velec            = _mm256_mul_ps(qq12,velec);

            d                = _mm256_sub_ps(r12,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx12);
            ty               = _mm256_mul_ps(fscal,dy12);
            tz               = _mm256_mul_ps(fscal,dz12);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq20,rcutoff2))
            {

            r20              = _mm256_mul_ps(rsq20,rinv20);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq20);
            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq20,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv20,pmecorrV);
            velec            = _mm256_mul_ps(qq20,velec);

            d                = _mm256_sub_ps(r20,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx20);
            ty               = _mm256_mul_ps(fscal,dy20);
            tz               = _mm256_mul_ps(fscal,dz20);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq21,rcutoff2))
            {

            r21              = _mm256_mul_ps(rsq21,rinv21);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq21);
            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq21,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv21,pmecorrV);
            velec            = _mm256_mul_ps(qq21,velec);

            d                = _mm256_sub_ps(r21,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx21);
            ty               = _mm256_mul_ps(fscal,dy21);
            tz               = _mm256_mul_ps(fscal,dz21);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq22,rcutoff2))
            {

            r22              = _mm256_mul_ps(rsq22,rinv22);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq22);
            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq22,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv22,pmecorrV);
            velec            = _mm256_mul_ps(qq22,velec);

            d                = _mm256_sub_ps(r22,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx22);
            ty               = _mm256_mul_ps(fscal,dy22);
            tz               = _mm256_mul_ps(fscal,dz22);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            fjptrA             = f+j_coord_offsetA;
            fjptrB             = f+j_coord_offsetB;
            fjptrC             = f+j_coord_offsetC;
            fjptrD             = f+j_coord_offsetD;
            fjptrE             = f+j_coord_offsetE;
            fjptrF             = f+j_coord_offsetF;
            fjptrG             = f+j_coord_offsetG;
            fjptrH             = f+j_coord_offsetH;

            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);

            /* Inner loop uses 990 flops */
        }

        if(jidx<j_index_end)
        {

            /* Get j neighbor index, and coordinate index */
            jnrlistA         = jjnr[jidx];
            jnrlistB         = jjnr[jidx+1];
            jnrlistC         = jjnr[jidx+2];
            jnrlistD         = jjnr[jidx+3];
            jnrlistE         = jjnr[jidx+4];
            jnrlistF         = jjnr[jidx+5];
            jnrlistG         = jjnr[jidx+6];
            jnrlistH         = jjnr[jidx+7];
            /* Sign of each element will be negative for non-real atoms.
             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
             */
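            /* Illustrative example (index values hypothetical): a padded batch
             *   jjnr[jidx..jidx+7] = { 12, 57, 3, -1, -1, -1, -1, -1 }
             * yields dummy_mask = { 0, 0, 0, ~0, ~0, ~0, ~0, ~0 } per 32-bit
             * lane, so the last five lanes are cleared from every accumulated
             * quantity below.
             */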
            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));

            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
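            /* Negative (dummy) indices are clamped to atom 0 so the coordinate
             * loads below stay in bounds; their contributions are removed via
             * dummy_mask before accumulation, so atom 0 receives no spurious
             * force from the padded lanes.
             */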
            j_coord_offsetA  = DIM*jnrA;
            j_coord_offsetB  = DIM*jnrB;
            j_coord_offsetC  = DIM*jnrC;
            j_coord_offsetD  = DIM*jnrD;
            j_coord_offsetE  = DIM*jnrE;
            j_coord_offsetF  = DIM*jnrF;
            j_coord_offsetG  = DIM*jnrG;
            j_coord_offsetH  = DIM*jnrH;

            /* load j atom coordinates */
            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
                                                 x+j_coord_offsetC,x+j_coord_offsetD,
                                                 x+j_coord_offsetE,x+j_coord_offsetF,
                                                 x+j_coord_offsetG,x+j_coord_offsetH,
                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);

            /* Calculate displacement vector */
            dx00             = _mm256_sub_ps(ix0,jx0);
            dy00             = _mm256_sub_ps(iy0,jy0);
            dz00             = _mm256_sub_ps(iz0,jz0);
            dx01             = _mm256_sub_ps(ix0,jx1);
            dy01             = _mm256_sub_ps(iy0,jy1);
            dz01             = _mm256_sub_ps(iz0,jz1);
            dx02             = _mm256_sub_ps(ix0,jx2);
            dy02             = _mm256_sub_ps(iy0,jy2);
            dz02             = _mm256_sub_ps(iz0,jz2);
            dx10             = _mm256_sub_ps(ix1,jx0);
            dy10             = _mm256_sub_ps(iy1,jy0);
            dz10             = _mm256_sub_ps(iz1,jz0);
            dx11             = _mm256_sub_ps(ix1,jx1);
            dy11             = _mm256_sub_ps(iy1,jy1);
            dz11             = _mm256_sub_ps(iz1,jz1);
            dx12             = _mm256_sub_ps(ix1,jx2);
            dy12             = _mm256_sub_ps(iy1,jy2);
            dz12             = _mm256_sub_ps(iz1,jz2);
            dx20             = _mm256_sub_ps(ix2,jx0);
            dy20             = _mm256_sub_ps(iy2,jy0);
            dz20             = _mm256_sub_ps(iz2,jz0);
            dx21             = _mm256_sub_ps(ix2,jx1);
            dy21             = _mm256_sub_ps(iy2,jy1);
            dz21             = _mm256_sub_ps(iz2,jz1);
            dx22             = _mm256_sub_ps(ix2,jx2);
            dy22             = _mm256_sub_ps(iy2,jy2);
            dz22             = _mm256_sub_ps(iz2,jz2);

            /* Calculate squared distance and things based on it */
            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);

            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
            rinv22           = gmx_mm256_invsqrt_ps(rsq22);

            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);

            fjx0             = _mm256_setzero_ps();
            fjy0             = _mm256_setzero_ps();
            fjz0             = _mm256_setzero_ps();
            fjx1             = _mm256_setzero_ps();
            fjy1             = _mm256_setzero_ps();
            fjz1             = _mm256_setzero_ps();
            fjx2             = _mm256_setzero_ps();
            fjy2             = _mm256_setzero_ps();
            fjz2             = _mm256_setzero_ps();

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq00,rcutoff2))
            {

            r00              = _mm256_mul_ps(rsq00,rinv00);
            r00              = _mm256_andnot_ps(dummy_mask,r00);
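            /* Zeroing r in dummy lanes avoids NaN propagation through the
             * switch polynomial when the clamped j coordinates coincide with
             * the i atom (rsq = 0 makes rinv infinite, and 0*inf gives NaN);
             * the dummy lanes are cleared again before accumulation anyway.
             */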
1008
1009             /* EWALD ELECTROSTATICS */
1010             
1011             /* Analytical PME correction */
1012             zeta2            = _mm256_mul_ps(beta2,rsq00);
1013             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1014             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1015             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1016             felec            = _mm256_mul_ps(qq00,felec);
1017             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1018             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1019             velec            = _mm256_sub_ps(rinv00,pmecorrV);
1020             velec            = _mm256_mul_ps(qq00,velec);
1021             
1022             /* LENNARD-JONES DISPERSION/REPULSION */
1023
1024             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1025             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
1026             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1027             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1028             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1029
1030             d                = _mm256_sub_ps(r00,rswitch);
1031             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1032             d2               = _mm256_mul_ps(d,d);
1033             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1034
1035             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1036
1037             /* Evaluate switch function */
1038             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1039             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1040             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1041             velec            = _mm256_mul_ps(velec,sw);
1042             vvdw             = _mm256_mul_ps(vvdw,sw);
1043             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1044
1045             /* Update potential sum for this i atom from the interaction with this j atom. */
1046             velec            = _mm256_and_ps(velec,cutoff_mask);
1047             velec            = _mm256_andnot_ps(dummy_mask,velec);
1048             velecsum         = _mm256_add_ps(velecsum,velec);
1049             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
1050             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
1051             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
1052
1053             fscal            = _mm256_add_ps(felec,fvdw);
1054
1055             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1056
1057             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1058
1059             /* Calculate temporary vectorial force */
1060             tx               = _mm256_mul_ps(fscal,dx00);
1061             ty               = _mm256_mul_ps(fscal,dy00);
1062             tz               = _mm256_mul_ps(fscal,dz00);
1063
1064             /* Update vectorial force */
1065             fix0             = _mm256_add_ps(fix0,tx);
1066             fiy0             = _mm256_add_ps(fiy0,ty);
1067             fiz0             = _mm256_add_ps(fiz0,tz);
1068
1069             fjx0             = _mm256_add_ps(fjx0,tx);
1070             fjy0             = _mm256_add_ps(fjy0,ty);
1071             fjz0             = _mm256_add_ps(fjz0,tz);
1072
1073             }
1074
1075             /**************************
1076              * CALCULATE INTERACTIONS *
1077              **************************/
1078
1079             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1080             {
1081
1082             r01              = _mm256_mul_ps(rsq01,rinv01);
1083             r01              = _mm256_andnot_ps(dummy_mask,r01);
1084
1085             /* EWALD ELECTROSTATICS */
1086             
1087             /* Analytical PME correction */
1088             zeta2            = _mm256_mul_ps(beta2,rsq01);
1089             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1090             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1091             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1092             felec            = _mm256_mul_ps(qq01,felec);
1093             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1094             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1095             velec            = _mm256_sub_ps(rinv01,pmecorrV);
1096             velec            = _mm256_mul_ps(qq01,velec);
1097             
1098             d                = _mm256_sub_ps(r01,rswitch);
1099             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1100             d2               = _mm256_mul_ps(d,d);
1101             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1102
1103             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1104
1105             /* Evaluate switch function */
1106             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1107             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
1108             velec            = _mm256_mul_ps(velec,sw);
1109             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1110
1111             /* Update potential sum for this i atom from the interaction with this j atom. */
1112             velec            = _mm256_and_ps(velec,cutoff_mask);
1113             velec            = _mm256_andnot_ps(dummy_mask,velec);
1114             velecsum         = _mm256_add_ps(velecsum,velec);
1115
1116             fscal            = felec;
1117
1118             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1119
1120             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1121
1122             /* Calculate temporary vectorial force */
1123             tx               = _mm256_mul_ps(fscal,dx01);
1124             ty               = _mm256_mul_ps(fscal,dy01);
1125             tz               = _mm256_mul_ps(fscal,dz01);
1126
1127             /* Update vectorial force */
1128             fix0             = _mm256_add_ps(fix0,tx);
1129             fiy0             = _mm256_add_ps(fiy0,ty);
1130             fiz0             = _mm256_add_ps(fiz0,tz);
1131
1132             fjx1             = _mm256_add_ps(fjx1,tx);
1133             fjy1             = _mm256_add_ps(fjy1,ty);
1134             fjz1             = _mm256_add_ps(fjz1,tz);
1135
1136             }
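             /* Scalar reference for the analytical PME correction blocks
              * (an illustrative sketch, not generated code): the pmecorrF/
              * pmecorrV polynomials in zeta2 = (beta*r)^2 reproduce the
              * real-space Ewald pair terms that could also be written with
              * erfc() from <math.h>:
              *
              *   velec = qq*erfc(beta*r)/r
              *   felec = qq*( erfc(beta*r)/r
              *                + 2*beta*exp(-beta^2*r^2)/sqrt(pi) ) / r^2
              *
              * Evaluating polynomials instead avoids per-element erfc()
              * calls, which have no AVX hardware equivalent.
              */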
1137
1138             /**************************
1139              * CALCULATE INTERACTIONS *
1140              **************************/
1141
1142             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1143             {
1144
1145             r02              = _mm256_mul_ps(rsq02,rinv02);
1146             r02              = _mm256_andnot_ps(dummy_mask,r02);
1147
1148             /* EWALD ELECTROSTATICS */
1149             
1150             /* Analytical PME correction */
1151             zeta2            = _mm256_mul_ps(beta2,rsq02);
1152             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1153             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1154             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1155             felec            = _mm256_mul_ps(qq02,felec);
1156             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1157             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1158             velec            = _mm256_sub_ps(rinv02,pmecorrV);
1159             velec            = _mm256_mul_ps(qq02,velec);
1160             
1161             d                = _mm256_sub_ps(r02,rswitch);
1162             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1163             d2               = _mm256_mul_ps(d,d);
1164             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1165
1166             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1167
1168             /* Evaluate switch function */
1169             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1170             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
1171             velec            = _mm256_mul_ps(velec,sw);
1172             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1173
1174             /* Update potential sum for this i atom from the interaction with this j atom. */
1175             velec            = _mm256_and_ps(velec,cutoff_mask);
1176             velec            = _mm256_andnot_ps(dummy_mask,velec);
1177             velecsum         = _mm256_add_ps(velecsum,velec);
1178
1179             fscal            = felec;
1180
1181             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1182
1183             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1184
1185             /* Calculate temporary vectorial force */
1186             tx               = _mm256_mul_ps(fscal,dx02);
1187             ty               = _mm256_mul_ps(fscal,dy02);
1188             tz               = _mm256_mul_ps(fscal,dz02);
1189
1190             /* Update vectorial force */
1191             fix0             = _mm256_add_ps(fix0,tx);
1192             fiy0             = _mm256_add_ps(fiy0,ty);
1193             fiz0             = _mm256_add_ps(fiz0,tz);
1194
1195             fjx2             = _mm256_add_ps(fjx2,tx);
1196             fjy2             = _mm256_add_ps(fjy2,ty);
1197             fjz2             = _mm256_add_ps(fjz2,tz);
1198
1199             }
1200
1201             /**************************
1202              * CALCULATE INTERACTIONS *
1203              **************************/
1204
1205             if (gmx_mm256_any_lt(rsq10,rcutoff2))
1206             {
1207
1208             r10              = _mm256_mul_ps(rsq10,rinv10);
1209             r10              = _mm256_andnot_ps(dummy_mask,r10);
1210
1211             /* EWALD ELECTROSTATICS */
1212             
1213             /* Analytical PME correction */
1214             zeta2            = _mm256_mul_ps(beta2,rsq10);
1215             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1216             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1217             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1218             felec            = _mm256_mul_ps(qq10,felec);
1219             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1220             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1221             velec            = _mm256_sub_ps(rinv10,pmecorrV);
1222             velec            = _mm256_mul_ps(qq10,velec);
1223             
1224             d                = _mm256_sub_ps(r10,rswitch);
1225             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1226             d2               = _mm256_mul_ps(d,d);
1227             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1228
1229             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1230
1231             /* Evaluate switch function */
1232             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1233             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
1234             velec            = _mm256_mul_ps(velec,sw);
1235             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1236
1237             /* Update potential sum for this i atom from the interaction with this j atom. */
1238             velec            = _mm256_and_ps(velec,cutoff_mask);
1239             velec            = _mm256_andnot_ps(dummy_mask,velec);
1240             velecsum         = _mm256_add_ps(velecsum,velec);
1241
1242             fscal            = felec;
1243
1244             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1245
1246             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1247
1248             /* Calculate temporary vectorial force */
1249             tx               = _mm256_mul_ps(fscal,dx10);
1250             ty               = _mm256_mul_ps(fscal,dy10);
1251             tz               = _mm256_mul_ps(fscal,dz10);
1252
1253             /* Update vectorial force */
1254             fix1             = _mm256_add_ps(fix1,tx);
1255             fiy1             = _mm256_add_ps(fiy1,ty);
1256             fiz1             = _mm256_add_ps(fiz1,tz);
1257
1258             fjx0             = _mm256_add_ps(fjx0,tx);
1259             fjy0             = _mm256_add_ps(fjy0,ty);
1260             fjz0             = _mm256_add_ps(fjz0,tz);
1261
1262             }
1263
1264             /**************************
1265              * CALCULATE INTERACTIONS *
1266              **************************/
1267
1268             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1269             {
1270
1271             r11              = _mm256_mul_ps(rsq11,rinv11);
1272             r11              = _mm256_andnot_ps(dummy_mask,r11);
1273
1274             /* EWALD ELECTROSTATICS */
1275             
1276             /* Analytical PME correction */
1277             zeta2            = _mm256_mul_ps(beta2,rsq11);
1278             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1279             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1280             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1281             felec            = _mm256_mul_ps(qq11,felec);
1282             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1283             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1284             velec            = _mm256_sub_ps(rinv11,pmecorrV);
1285             velec            = _mm256_mul_ps(qq11,velec);
1286             
1287             d                = _mm256_sub_ps(r11,rswitch);
1288             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1289             d2               = _mm256_mul_ps(d,d);
1290             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1291
1292             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1293
1294             /* Evaluate switch function */
1295             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1296             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1297             velec            = _mm256_mul_ps(velec,sw);
1298             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1299
1300             /* Update potential sum for this i atom from the interaction with this j atom. */
1301             velec            = _mm256_and_ps(velec,cutoff_mask);
1302             velec            = _mm256_andnot_ps(dummy_mask,velec);
1303             velecsum         = _mm256_add_ps(velecsum,velec);
1304
1305             fscal            = felec;
1306
1307             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1308
1309             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1310
1311             /* Calculate temporary vectorial force */
1312             tx               = _mm256_mul_ps(fscal,dx11);
1313             ty               = _mm256_mul_ps(fscal,dy11);
1314             tz               = _mm256_mul_ps(fscal,dz11);
1315
1316             /* Update vectorial force */
1317             fix1             = _mm256_add_ps(fix1,tx);
1318             fiy1             = _mm256_add_ps(fiy1,ty);
1319             fiz1             = _mm256_add_ps(fiz1,tz);
1320
1321             fjx1             = _mm256_add_ps(fjx1,tx);
1322             fjy1             = _mm256_add_ps(fjy1,ty);
1323             fjz1             = _mm256_add_ps(fjz1,tz);
1324
1325             }
1326
1327             /**************************
1328              * CALCULATE INTERACTIONS *
1329              **************************/
1330
1331             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1332             {
1333
1334             r12              = _mm256_mul_ps(rsq12,rinv12);
1335             r12              = _mm256_andnot_ps(dummy_mask,r12);
1336
1337             /* EWALD ELECTROSTATICS */
1338             
1339             /* Analytical PME correction */
1340             zeta2            = _mm256_mul_ps(beta2,rsq12);
1341             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1342             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1343             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1344             felec            = _mm256_mul_ps(qq12,felec);
1345             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1346             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1347             velec            = _mm256_sub_ps(rinv12,pmecorrV);
1348             velec            = _mm256_mul_ps(qq12,velec);
1349             
1350             d                = _mm256_sub_ps(r12,rswitch);
1351             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1352             d2               = _mm256_mul_ps(d,d);
1353             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1354
1355             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1356
1357             /* Evaluate switch function */
1358             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1359             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1360             velec            = _mm256_mul_ps(velec,sw);
1361             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1362
1363             /* Update potential sum for this i atom from the interaction with this j atom. */
1364             velec            = _mm256_and_ps(velec,cutoff_mask);
1365             velec            = _mm256_andnot_ps(dummy_mask,velec);
1366             velecsum         = _mm256_add_ps(velecsum,velec);
1367
1368             fscal            = felec;
1369
1370             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1371
1372             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1373
1374             /* Calculate temporary vectorial force */
1375             tx               = _mm256_mul_ps(fscal,dx12);
1376             ty               = _mm256_mul_ps(fscal,dy12);
1377             tz               = _mm256_mul_ps(fscal,dz12);
1378
1379             /* Update vectorial force */
1380             fix1             = _mm256_add_ps(fix1,tx);
1381             fiy1             = _mm256_add_ps(fiy1,ty);
1382             fiz1             = _mm256_add_ps(fiz1,tz);
1383
1384             fjx2             = _mm256_add_ps(fjx2,tx);
1385             fjy2             = _mm256_add_ps(fjy2,ty);
1386             fjz2             = _mm256_add_ps(fjz2,tz);
1387
1388             }
1389
1390             /**************************
1391              * CALCULATE INTERACTIONS *
1392              **************************/
1393
1394             if (gmx_mm256_any_lt(rsq20,rcutoff2))
1395             {
1396
1397             r20              = _mm256_mul_ps(rsq20,rinv20);
1398             r20              = _mm256_andnot_ps(dummy_mask,r20);
1399
1400             /* EWALD ELECTROSTATICS */
1401             
1402             /* Analytical PME correction */
1403             zeta2            = _mm256_mul_ps(beta2,rsq20);
1404             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1405             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1406             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1407             felec            = _mm256_mul_ps(qq20,felec);
1408             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1409             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1410             velec            = _mm256_sub_ps(rinv20,pmecorrV);
1411             velec            = _mm256_mul_ps(qq20,velec);
1412             
1413             d                = _mm256_sub_ps(r20,rswitch);
1414             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1415             d2               = _mm256_mul_ps(d,d);
1416             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1417
1418             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1419
1420             /* Evaluate switch function */
1421             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1422             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
1423             velec            = _mm256_mul_ps(velec,sw);
1424             cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1425
1426             /* Update potential sum for this i atom from the interaction with this j atom. */
1427             velec            = _mm256_and_ps(velec,cutoff_mask);
1428             velec            = _mm256_andnot_ps(dummy_mask,velec);
1429             velecsum         = _mm256_add_ps(velecsum,velec);
1430
1431             fscal            = felec;
1432
1433             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1434
1435             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1436
1437             /* Calculate temporary vectorial force */
1438             tx               = _mm256_mul_ps(fscal,dx20);
1439             ty               = _mm256_mul_ps(fscal,dy20);
1440             tz               = _mm256_mul_ps(fscal,dz20);
1441
1442             /* Update vectorial force */
1443             fix2             = _mm256_add_ps(fix2,tx);
1444             fiy2             = _mm256_add_ps(fiy2,ty);
1445             fiz2             = _mm256_add_ps(fiz2,tz);
1446
1447             fjx0             = _mm256_add_ps(fjx0,tx);
1448             fjy0             = _mm256_add_ps(fjy0,ty);
1449             fjz0             = _mm256_add_ps(fjz0,tz);
1450
1451             }
1452
1453             /**************************
1454              * CALCULATE INTERACTIONS *
1455              **************************/
1456
1457             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1458             {
1459
1460             r21              = _mm256_mul_ps(rsq21,rinv21);
1461             r21              = _mm256_andnot_ps(dummy_mask,r21);
1462
1463             /* EWALD ELECTROSTATICS */
1464             
1465             /* Analytical PME correction */
1466             zeta2            = _mm256_mul_ps(beta2,rsq21);
1467             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1468             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1469             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1470             felec            = _mm256_mul_ps(qq21,felec);
1471             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1472             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1473             velec            = _mm256_sub_ps(rinv21,pmecorrV);
1474             velec            = _mm256_mul_ps(qq21,velec);
1475             
1476             d                = _mm256_sub_ps(r21,rswitch);
1477             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1478             d2               = _mm256_mul_ps(d,d);
1479             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1480
1481             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1482
1483             /* Evaluate switch function */
1484             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1485             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1486             velec            = _mm256_mul_ps(velec,sw);
1487             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1488
1489             /* Update potential sum for this i atom from the interaction with this j atom. */
1490             velec            = _mm256_and_ps(velec,cutoff_mask);
1491             velec            = _mm256_andnot_ps(dummy_mask,velec);
1492             velecsum         = _mm256_add_ps(velecsum,velec);
1493
1494             fscal            = felec;
1495
1496             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1497
1498             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1499
1500             /* Calculate temporary vectorial force */
1501             tx               = _mm256_mul_ps(fscal,dx21);
1502             ty               = _mm256_mul_ps(fscal,dy21);
1503             tz               = _mm256_mul_ps(fscal,dz21);
1504
1505             /* Update vectorial force */
1506             fix2             = _mm256_add_ps(fix2,tx);
1507             fiy2             = _mm256_add_ps(fiy2,ty);
1508             fiz2             = _mm256_add_ps(fiz2,tz);
1509
1510             fjx1             = _mm256_add_ps(fjx1,tx);
1511             fjy1             = _mm256_add_ps(fjy1,ty);
1512             fjz1             = _mm256_add_ps(fjz1,tz);
1513
1514             }
1515
1516             /**************************
1517              * CALCULATE INTERACTIONS *
1518              **************************/
1519
1520             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1521             {
1522
1523             r22              = _mm256_mul_ps(rsq22,rinv22);
1524             r22              = _mm256_andnot_ps(dummy_mask,r22);
1525
1526             /* EWALD ELECTROSTATICS */
1527             
1528             /* Analytical PME correction */
1529             zeta2            = _mm256_mul_ps(beta2,rsq22);
1530             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1531             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1532             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1533             felec            = _mm256_mul_ps(qq22,felec);
1534             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1535             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1536             velec            = _mm256_sub_ps(rinv22,pmecorrV);
1537             velec            = _mm256_mul_ps(qq22,velec);
1538             
1539             d                = _mm256_sub_ps(r22,rswitch);
1540             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1541             d2               = _mm256_mul_ps(d,d);
1542             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1543
1544             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1545
1546             /* Evaluate switch function */
1547             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1548             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1549             velec            = _mm256_mul_ps(velec,sw);
1550             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1551
1552             /* Update potential sum for this i atom from the interaction with this j atom. */
1553             velec            = _mm256_and_ps(velec,cutoff_mask);
1554             velec            = _mm256_andnot_ps(dummy_mask,velec);
1555             velecsum         = _mm256_add_ps(velecsum,velec);
1556
1557             fscal            = felec;
1558
1559             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1560
1561             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1562
1563             /* Calculate temporary vectorial force */
1564             tx               = _mm256_mul_ps(fscal,dx22);
1565             ty               = _mm256_mul_ps(fscal,dy22);
1566             tz               = _mm256_mul_ps(fscal,dz22);
1567
1568             /* Update vectorial force */
1569             fix2             = _mm256_add_ps(fix2,tx);
1570             fiy2             = _mm256_add_ps(fiy2,ty);
1571             fiz2             = _mm256_add_ps(fiz2,tz);
1572
1573             fjx2             = _mm256_add_ps(fjx2,tx);
1574             fjy2             = _mm256_add_ps(fjy2,ty);
1575             fjz2             = _mm256_add_ps(fjz2,tz);
1576
1577             }
1578
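             /* Write back j forces: lanes whose neighbor entry is padding
              * (jnrlist < 0) are redirected to the local scratch buffer, so
              * the swizzled decrement below can store all eight lanes
              * unconditionally without corrupting real force memory. */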
1579             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1580             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1581             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1582             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1583             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1584             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1585             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1586             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1587
1588             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1589                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1590
1591             /* Inner loop uses 999 flops */
1592         }
1593
1594         /* End of innermost loop */
1595
1596         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1597                                                  f+i_coord_offset,fshift+i_shift_offset);
1598
1599         ggid                        = gid[iidx];
1600         /* Update potential energies: each *sum register holds eight partial energies, and the update_1pot helpers reduce them horizontally into the energy of group pair ggid */
1601         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1602         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1603
1604         /* Increment number of inner iterations */
1605         inneriter                  += j_index_end - j_index_start;
1606
1607         /* Outer loop uses 20 flops */
1608     }
1609
1610     /* Increment number of outer iterations */
1611     outeriter        += nri;
1612
1613     /* Update outer/inner flops */
1614
1615     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*999);
1616 }
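/*
 * Illustrative scalar sketch of the pair math implemented by the kernels in
 * this file (an editorial reference, not generator output, and never called;
 * the function name is made up, and the explicit erfc()/exp() calls stand in
 * for the SIMD pmecorr polynomial approximations). It returns fscal = F/r for
 * one atom pair and stores the switched potentials. As the 1/12 and 1/6
 * energy factors in the kernels imply, the c6/c12 parameters from fr->nbfp
 * carry prefactors 6 and 12.
 */
#if 0
static real
ref_pair_ElecEwSw_VdwLJSw(real qq, real c6, real c12, real rsq,
                          real beta, real rswitch, real rcutoff,
                          real *velec, real *vvdw)
{
    real r,rinv,rinvsq,rinv6,ve,fe,vv,fv,d,x,sw,dsw;

    if (rsq >= rcutoff*rcutoff)
    {
        *velec = *vvdw = 0;
        return 0;
    }
    r      = sqrt(rsq);
    rinv   = 1.0/r;
    rinvsq = rinv*rinv;
    rinv6  = rinvsq*rinvsq*rinvsq;

    /* Real-space Ewald electrostatics, erfc form of the pmecorr terms */
    ve     = qq*erfc(beta*r)*rinv;
    fe     = qq*(erfc(beta*r)*rinv + 2.0*beta*exp(-beta*beta*rsq)/sqrt(M_PI))*rinvsq;

    /* Lennard-Jones dispersion/repulsion */
    vv     = c12*rinv6*rinv6/12.0 - c6*rinv6/6.0;
    fv     = (c12*rinv6*rinv6 - c6*rinv6)*rinvsq;

    /* Quintic switch S(r) and its radial derivative on [rswitch,rcutoff] */
    d      = (r > rswitch) ? r - rswitch : 0.0;
    x      = d/(rcutoff - rswitch);
    sw     = 1.0 - x*x*x*(10.0 - 15.0*x + 6.0*x*x);
    dsw    = -30.0*x*x*(1.0 - x)*(1.0 - x)/(rcutoff - rswitch);

    /* f' = -(v*S)'/r = fscal*S - v*(dS/dr)/r, as in the kernel comments */
    *velec = ve*sw;
    *vvdw  = vv*sw;
    return (fe + fv)*sw - (ve + vv)*dsw*rinv;
}
#endif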
1617 /*
1618  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_single
1619  * Electrostatics interaction: Ewald
1620  * VdW interaction:            LennardJones
1621  * Geometry:                   Water3-Water3
1622  * Calculate force/pot:        Force
1623  */
1624 void
1625 nb_kernel_ElecEwSw_VdwLJSw_GeomW3W3_F_avx_256_single
1626                     (t_nblist * gmx_restrict                nlist,
1627                      rvec * gmx_restrict                    xx,
1628                      rvec * gmx_restrict                    ff,
1629                      t_forcerec * gmx_restrict              fr,
1630                      t_mdatoms * gmx_restrict               mdatoms,
1631                      nb_kernel_data_t * gmx_restrict        kernel_data,
1632                      t_nrnb * gmx_restrict                  nrnb)
1633 {
1634     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1635      * just 0 for non-waters.
1636      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1637      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1638      */
1639     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1640     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1641     int              jnrA,jnrB,jnrC,jnrD;
1642     int              jnrE,jnrF,jnrG,jnrH;
1643     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1644     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1645     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1646     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1647     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1648     real             rcutoff_scalar;
1649     real             *shiftvec,*fshift,*x,*f;
1650     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1651     real             scratch[4*DIM];
1652     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1653     real *           vdwioffsetptr0;
1654     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1655     real *           vdwioffsetptr1;
1656     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1657     real *           vdwioffsetptr2;
1658     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1659     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1660     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1661     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1662     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1663     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1664     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1665     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1666     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1667     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1668     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1669     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1670     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1671     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1672     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1673     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1674     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1675     real             *charge;
1676     int              nvdwtype;
1677     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1678     int              *vdwtype;
1679     real             *vdwparam;
1680     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1681     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1682     __m256i          ewitab;
1683     __m128i          ewitab_lo,ewitab_hi;
1684     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1685     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1686     real             *ewtab;
1687     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1688     real             rswitch_scalar,d_scalar;
1689     __m256           dummy_mask,cutoff_mask;
1690     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1691     __m256           one     = _mm256_set1_ps(1.0);
1692     __m256           two     = _mm256_set1_ps(2.0);
1693     x                = xx[0];
1694     f                = ff[0];
1695
1696     nri              = nlist->nri;
1697     iinr             = nlist->iinr;
1698     jindex           = nlist->jindex;
1699     jjnr             = nlist->jjnr;
1700     shiftidx         = nlist->shift;
1701     gid              = nlist->gid;
1702     shiftvec         = fr->shift_vec[0];
1703     fshift           = fr->fshift[0];
1704     facel            = _mm256_set1_ps(fr->epsfac);
1705     charge           = mdatoms->chargeA;
1706     nvdwtype         = fr->ntype;
1707     vdwparam         = fr->nbfp;
1708     vdwtype          = mdatoms->typeA;
1709
1710     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1711     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff);
1712     beta2            = _mm256_mul_ps(beta,beta);
1713     beta3            = _mm256_mul_ps(beta,beta2);
1714
1715     ewtab            = fr->ic->tabq_coul_FDV0;
1716     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1717     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1718
1719     /* Setup water-specific parameters */
1720     inr              = nlist->iinr[0];
1721     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1722     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1723     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1724     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1725
1726     jq0              = _mm256_set1_ps(charge[inr+0]);
1727     jq1              = _mm256_set1_ps(charge[inr+1]);
1728     jq2              = _mm256_set1_ps(charge[inr+2]);
1729     vdwjidx0A        = 2*vdwtype[inr+0];
1730     qq00             = _mm256_mul_ps(iq0,jq0);
1731     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1732     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1733     qq01             = _mm256_mul_ps(iq0,jq1);
1734     qq02             = _mm256_mul_ps(iq0,jq2);
1735     qq10             = _mm256_mul_ps(iq1,jq0);
1736     qq11             = _mm256_mul_ps(iq1,jq1);
1737     qq12             = _mm256_mul_ps(iq1,jq2);
1738     qq20             = _mm256_mul_ps(iq2,jq0);
1739     qq21             = _mm256_mul_ps(iq2,jq1);
1740     qq22             = _mm256_mul_ps(iq2,jq2);
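    /* Both i and j particles are rigid 3-site waters with fixed topology, so
     * all nine charge products and the single O-O c6/c12 pair are
     * precomputed here, outside the loops, instead of being reloaded for
     * every interaction. */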
1741
1742     /* When we use explicit cutoffs, the cutoff value must be identical for elec and VdW, so we use the elec cutoff as an arbitrary choice */
1743     rcutoff_scalar   = fr->rcoulomb;
1744     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1745     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1746
1747     rswitch_scalar   = fr->rcoulomb_switch;
1748     rswitch          = _mm256_set1_ps(rswitch_scalar);
1749     /* Setup switch parameters */
1750     d_scalar         = rcutoff_scalar-rswitch_scalar;
1751     d                = _mm256_set1_ps(d_scalar);
1752     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1753     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1754     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1755     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1756     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1757     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
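    /* With x = (r-rswitch_scalar)/d_scalar on [0,1], these coefficients give
     * the standard quintic switch S = 1 - 10x^3 + 15x^4 - 6x^5 (swV3..swV5)
     * and its radial derivative dS/dr = -30x^2(1-x)^2/d_scalar (swF2..swF4),
     * so both S and dS/dr go smoothly to zero at the cutoff. */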
1758
1759     /* Avoid stupid compiler warnings */
1760     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1761     j_coord_offsetA = 0;
1762     j_coord_offsetB = 0;
1763     j_coord_offsetC = 0;
1764     j_coord_offsetD = 0;
1765     j_coord_offsetE = 0;
1766     j_coord_offsetF = 0;
1767     j_coord_offsetG = 0;
1768     j_coord_offsetH = 0;
1769
1770     outeriter        = 0;
1771     inneriter        = 0;
1772
1773     for(iidx=0;iidx<4*DIM;iidx++)
1774     {
1775         scratch[iidx] = 0.0;
1776     }
1777
1778     /* Start outer loop over neighborlists */
1779     for(iidx=0; iidx<nri; iidx++)
1780     {
1781         /* Load shift vector for this list */
1782         i_shift_offset   = DIM*shiftidx[iidx];
1783
1784         /* Load limits for loop over neighbors */
1785         j_index_start    = jindex[iidx];
1786         j_index_end      = jindex[iidx+1];
1787
1788         /* Get outer coordinate index */
1789         inr              = iinr[iidx];
1790         i_coord_offset   = DIM*inr;
1791
1792         /* Load i particle coords and add shift vector */
1793         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1794                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1795
1796         fix0             = _mm256_setzero_ps();
1797         fiy0             = _mm256_setzero_ps();
1798         fiz0             = _mm256_setzero_ps();
1799         fix1             = _mm256_setzero_ps();
1800         fiy1             = _mm256_setzero_ps();
1801         fiz1             = _mm256_setzero_ps();
1802         fix2             = _mm256_setzero_ps();
1803         fiy2             = _mm256_setzero_ps();
1804         fiz2             = _mm256_setzero_ps();
1805
1806         /* Start inner kernel loop. This first loop handles only full 8-wide batches of j particles (jjnr[jidx+7]>=0); a masked loop follows for any remainder, as in the VF kernel above. */
1807         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1808         {
1809
1810             /* Get j neighbor index, and coordinate index */
1811             jnrA             = jjnr[jidx];
1812             jnrB             = jjnr[jidx+1];
1813             jnrC             = jjnr[jidx+2];
1814             jnrD             = jjnr[jidx+3];
1815             jnrE             = jjnr[jidx+4];
1816             jnrF             = jjnr[jidx+5];
1817             jnrG             = jjnr[jidx+6];
1818             jnrH             = jjnr[jidx+7];
1819             j_coord_offsetA  = DIM*jnrA;
1820             j_coord_offsetB  = DIM*jnrB;
1821             j_coord_offsetC  = DIM*jnrC;
1822             j_coord_offsetD  = DIM*jnrD;
1823             j_coord_offsetE  = DIM*jnrE;
1824             j_coord_offsetF  = DIM*jnrF;
1825             j_coord_offsetG  = DIM*jnrG;
1826             j_coord_offsetH  = DIM*jnrH;
1827
1828             /* load j atom coordinates */
1829             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1830                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1831                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1832                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1833                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
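            /* The swizzled load gathers x/y/z for three atoms from eight j
             * entries and transposes them into nine AVX registers, one per
             * coordinate component. */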
1834
1835             /* Calculate displacement vector */
1836             dx00             = _mm256_sub_ps(ix0,jx0);
1837             dy00             = _mm256_sub_ps(iy0,jy0);
1838             dz00             = _mm256_sub_ps(iz0,jz0);
1839             dx01             = _mm256_sub_ps(ix0,jx1);
1840             dy01             = _mm256_sub_ps(iy0,jy1);
1841             dz01             = _mm256_sub_ps(iz0,jz1);
1842             dx02             = _mm256_sub_ps(ix0,jx2);
1843             dy02             = _mm256_sub_ps(iy0,jy2);
1844             dz02             = _mm256_sub_ps(iz0,jz2);
1845             dx10             = _mm256_sub_ps(ix1,jx0);
1846             dy10             = _mm256_sub_ps(iy1,jy0);
1847             dz10             = _mm256_sub_ps(iz1,jz0);
1848             dx11             = _mm256_sub_ps(ix1,jx1);
1849             dy11             = _mm256_sub_ps(iy1,jy1);
1850             dz11             = _mm256_sub_ps(iz1,jz1);
1851             dx12             = _mm256_sub_ps(ix1,jx2);
1852             dy12             = _mm256_sub_ps(iy1,jy2);
1853             dz12             = _mm256_sub_ps(iz1,jz2);
1854             dx20             = _mm256_sub_ps(ix2,jx0);
1855             dy20             = _mm256_sub_ps(iy2,jy0);
1856             dz20             = _mm256_sub_ps(iz2,jz0);
1857             dx21             = _mm256_sub_ps(ix2,jx1);
1858             dy21             = _mm256_sub_ps(iy2,jy1);
1859             dz21             = _mm256_sub_ps(iz2,jz1);
1860             dx22             = _mm256_sub_ps(ix2,jx2);
1861             dy22             = _mm256_sub_ps(iy2,jy2);
1862             dz22             = _mm256_sub_ps(iz2,jz2);
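            /* All nine site-site displacement vectors between the two 3-site
             * waters are formed up front; each of the interaction blocks
             * below consumes one of them. */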
1863
1864             /* Calculate squared distance and things based on it */
1865             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1866             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1867             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1868             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1869             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1870             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1871             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1872             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1873             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1874
1875             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1876             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1877             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1878             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1879             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1880             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1881             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1882             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1883             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
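            /* gmx_mm256_invsqrt_ps returns 1/sqrt(rsq); in the single
             * precision AVX kernels this is typically the hardware rsqrt
             * estimate refined by one Newton-Raphson step. */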
1884
1885             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1886             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1887             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1888             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1889             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1890             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1891             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1892             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1893             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1894
1895             fjx0             = _mm256_setzero_ps();
1896             fjy0             = _mm256_setzero_ps();
1897             fjz0             = _mm256_setzero_ps();
1898             fjx1             = _mm256_setzero_ps();
1899             fjy1             = _mm256_setzero_ps();
1900             fjz1             = _mm256_setzero_ps();
1901             fjx2             = _mm256_setzero_ps();
1902             fjy2             = _mm256_setzero_ps();
1903             fjz2             = _mm256_setzero_ps();
1904
1905             /**************************
1906              * CALCULATE INTERACTIONS *
1907              **************************/
1908
1909             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1910             {
1911
1912             r00              = _mm256_mul_ps(rsq00,rinv00);
1913
1914             /* EWALD ELECTROSTATICS */
1915             
1916             /* Analytical PME correction */
1917             zeta2            = _mm256_mul_ps(beta2,rsq00);
1918             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1919             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1920             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1921             felec            = _mm256_mul_ps(qq00,felec);
1922             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1923             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1924             velec            = _mm256_sub_ps(rinv00,pmecorrV);
1925             velec            = _mm256_mul_ps(qq00,velec);
1926             
1927             /* LENNARD-JONES DISPERSION/REPULSION */
1928
1929             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1930             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
1931             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
1932             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
1933             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
1934
1935             d                = _mm256_sub_ps(r00,rswitch);
1936             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1937             d2               = _mm256_mul_ps(d,d);
1938             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1939
1940             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1941
1942             /* Evaluate switch function */
1943             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1944             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1945             fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
1946             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1947
1948             fscal            = _mm256_add_ps(felec,fvdw);
1949
1950             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1951
1952             /* Calculate temporary vectorial force */
1953             tx               = _mm256_mul_ps(fscal,dx00);
1954             ty               = _mm256_mul_ps(fscal,dy00);
1955             tz               = _mm256_mul_ps(fscal,dz00);
1956
1957             /* Update vectorial force */
1958             fix0             = _mm256_add_ps(fix0,tx);
1959             fiy0             = _mm256_add_ps(fiy0,ty);
1960             fiz0             = _mm256_add_ps(fiz0,tz);
1961
1962             fjx0             = _mm256_add_ps(fjx0,tx);
1963             fjy0             = _mm256_add_ps(fjy0,ty);
1964             fjz0             = _mm256_add_ps(fjz0,tz);
1965
1966             }
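            /* Scalar form of the Lennard-Jones block above (an illustrative
             * sketch, not generated code):
             *   vvdw = c12*rinv^12/12 - c6*rinv^6/6
             *   fvdw = (c12*rinv^12 - c6*rinv^6)/r^2
             * i.e. fvdw, like felec, is the force divided by r. */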
1967
1968             /**************************
1969              * CALCULATE INTERACTIONS *
1970              **************************/
1971
1972             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1973             {
1974
1975             r01              = _mm256_mul_ps(rsq01,rinv01);
1976
1977             /* EWALD ELECTROSTATICS */
1978             
1979             /* Analytical PME correction */
1980             zeta2            = _mm256_mul_ps(beta2,rsq01);
1981             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1982             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1983             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1984             felec            = _mm256_mul_ps(qq01,felec);
1985             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1986             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1987             velec            = _mm256_sub_ps(rinv01,pmecorrV);
1988             velec            = _mm256_mul_ps(qq01,velec);
1989             
1990             d                = _mm256_sub_ps(r01,rswitch);
1991             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1992             d2               = _mm256_mul_ps(d,d);
1993             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1994
1995             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1996
1997             /* Evaluate switch function */
1998             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1999             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
2000             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2001
2002             fscal            = felec;
2003
2004             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2005
2006             /* Calculate temporary vectorial force */
2007             tx               = _mm256_mul_ps(fscal,dx01);
2008             ty               = _mm256_mul_ps(fscal,dy01);
2009             tz               = _mm256_mul_ps(fscal,dz01);
2010
2011             /* Update vectorial force */
2012             fix0             = _mm256_add_ps(fix0,tx);
2013             fiy0             = _mm256_add_ps(fiy0,ty);
2014             fiz0             = _mm256_add_ps(fiz0,tz);
2015
2016             fjx1             = _mm256_add_ps(fjx1,tx);
2017             fjy1             = _mm256_add_ps(fjy1,ty);
2018             fjz1             = _mm256_add_ps(fjz1,tz);
2019
2020             }
2021
2022             /**************************
2023              * CALCULATE INTERACTIONS *
2024              **************************/
2025
2026             if (gmx_mm256_any_lt(rsq02,rcutoff2))
2027             {
2028
2029             r02              = _mm256_mul_ps(rsq02,rinv02);
2030
2031             /* EWALD ELECTROSTATICS */
2032             
2033             /* Analytical PME correction */
2034             zeta2            = _mm256_mul_ps(beta2,rsq02);
2035             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
2036             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2037             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2038             felec            = _mm256_mul_ps(qq02,felec);
2039             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2040             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2041             velec            = _mm256_sub_ps(rinv02,pmecorrV);
2042             velec            = _mm256_mul_ps(qq02,velec);
2043             
2044             d                = _mm256_sub_ps(r02,rswitch);
2045             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2046             d2               = _mm256_mul_ps(d,d);
2047             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2048
2049             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2050
2051             /* Evaluate switch function */
2052             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2053             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
2054             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2055
2056             fscal            = felec;
2057
2058             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2059
2060             /* Calculate temporary vectorial force */
2061             tx               = _mm256_mul_ps(fscal,dx02);
2062             ty               = _mm256_mul_ps(fscal,dy02);
2063             tz               = _mm256_mul_ps(fscal,dz02);
2064
2065             /* Update vectorial force */
2066             fix0             = _mm256_add_ps(fix0,tx);
2067             fiy0             = _mm256_add_ps(fiy0,ty);
2068             fiz0             = _mm256_add_ps(fiz0,tz);
2069
2070             fjx2             = _mm256_add_ps(fjx2,tx);
2071             fjy2             = _mm256_add_ps(fjy2,ty);
2072             fjz2             = _mm256_add_ps(fjz2,tz);
2073
2074             }
2075
2076             /**************************
2077              * CALCULATE INTERACTIONS *
2078              **************************/
2079
2080             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2081             {
2082
2083             r10              = _mm256_mul_ps(rsq10,rinv10);
2084
2085             /* EWALD ELECTROSTATICS */
2086             
2087             /* Analytical PME correction */
2088             zeta2            = _mm256_mul_ps(beta2,rsq10);
2089             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
2090             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2091             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2092             felec            = _mm256_mul_ps(qq10,felec);
2093             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2094             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2095             velec            = _mm256_sub_ps(rinv10,pmecorrV);
2096             velec            = _mm256_mul_ps(qq10,velec);
2097             
2098             d                = _mm256_sub_ps(r10,rswitch);
2099             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2100             d2               = _mm256_mul_ps(d,d);
2101             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2102
2103             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2104
2105             /* Evaluate switch function */
2106             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2107             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
2108             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2109
2110             fscal            = felec;
2111
2112             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2113
2114             /* Calculate temporary vectorial force */
2115             tx               = _mm256_mul_ps(fscal,dx10);
2116             ty               = _mm256_mul_ps(fscal,dy10);
2117             tz               = _mm256_mul_ps(fscal,dz10);
2118
2119             /* Update vectorial force */
2120             fix1             = _mm256_add_ps(fix1,tx);
2121             fiy1             = _mm256_add_ps(fiy1,ty);
2122             fiz1             = _mm256_add_ps(fiz1,tz);
2123
2124             fjx0             = _mm256_add_ps(fjx0,tx);
2125             fjy0             = _mm256_add_ps(fjy0,ty);
2126             fjz0             = _mm256_add_ps(fjz0,tz);
2127
2128             }
2129
2130             /**************************
2131              * CALCULATE INTERACTIONS *
2132              **************************/
2133
2134             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2135             {
2136
2137             r11              = _mm256_mul_ps(rsq11,rinv11);
2138
2139             /* EWALD ELECTROSTATICS */
2140             
2141             /* Analytical PME correction */
2142             zeta2            = _mm256_mul_ps(beta2,rsq11);
2143             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2144             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2145             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2146             felec            = _mm256_mul_ps(qq11,felec);
2147             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2148             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2149             velec            = _mm256_sub_ps(rinv11,pmecorrV);
2150             velec            = _mm256_mul_ps(qq11,velec);
2151             
2152             d                = _mm256_sub_ps(r11,rswitch);
2153             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2154             d2               = _mm256_mul_ps(d,d);
2155             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2156
2157             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2158
2159             /* Evaluate switch function */
2160             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2161             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2162             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2163
2164             fscal            = felec;
2165
2166             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2167
2168             /* Calculate temporary vectorial force */
2169             tx               = _mm256_mul_ps(fscal,dx11);
2170             ty               = _mm256_mul_ps(fscal,dy11);
2171             tz               = _mm256_mul_ps(fscal,dz11);
2172
2173             /* Update vectorial force */
2174             fix1             = _mm256_add_ps(fix1,tx);
2175             fiy1             = _mm256_add_ps(fiy1,ty);
2176             fiz1             = _mm256_add_ps(fiz1,tz);
2177
2178             fjx1             = _mm256_add_ps(fjx1,tx);
2179             fjy1             = _mm256_add_ps(fjy1,ty);
2180             fjz1             = _mm256_add_ps(fjz1,tz);
2181
2182             }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq12,rcutoff2))
            {

            r12              = _mm256_mul_ps(rsq12,rinv12);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq12);
            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq12,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv12,pmecorrV);
            velec            = _mm256_mul_ps(qq12,velec);

            d                = _mm256_sub_ps(r12,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx12);
            ty               = _mm256_mul_ps(fscal,dy12);
            tz               = _mm256_mul_ps(fscal,dz12);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq20,rcutoff2))
            {

            r20              = _mm256_mul_ps(rsq20,rinv20);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq20);
            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq20,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv20,pmecorrV);
            velec            = _mm256_mul_ps(qq20,velec);

            d                = _mm256_sub_ps(r20,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx20);
            ty               = _mm256_mul_ps(fscal,dy20);
            tz               = _mm256_mul_ps(fscal,dz20);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq21,rcutoff2))
            {

            r21              = _mm256_mul_ps(rsq21,rinv21);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq21);
            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq21,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv21,pmecorrV);
            velec            = _mm256_mul_ps(qq21,velec);

            d                = _mm256_sub_ps(r21,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx21);
            ty               = _mm256_mul_ps(fscal,dy21);
            tz               = _mm256_mul_ps(fscal,dz21);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq22,rcutoff2))
            {

            r22              = _mm256_mul_ps(rsq22,rinv22);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq22);
            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq22,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv22,pmecorrV);
            velec            = _mm256_mul_ps(qq22,velec);

            d                = _mm256_sub_ps(r22,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx22);
            ty               = _mm256_mul_ps(fscal,dy22);
            tz               = _mm256_mul_ps(fscal,dz22);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            fjptrA             = f+j_coord_offsetA;
            fjptrB             = f+j_coord_offsetB;
            fjptrC             = f+j_coord_offsetC;
            fjptrD             = f+j_coord_offsetD;
            fjptrE             = f+j_coord_offsetE;
            fjptrF             = f+j_coord_offsetF;
            fjptrG             = f+j_coord_offsetG;
            fjptrH             = f+j_coord_offsetH;

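            /* The j forces were accumulated with the same sign as the i forces, so
             * the swizzled scatter below subtracts them from f[], which presumably
             * realizes Newton's third law: each of the eight j waters receives minus
             * the force its three atoms exerted on the i water.
             */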
            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);

            /* Inner loop uses 960 flops */
        }

        if(jidx<j_index_end)
        {
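            /* Epilogue for the last, possibly partial batch of j entries: missing
             * neighbors are flagged by negative indices in jjnr, loaded from atom 0
             * instead, and their contributions masked out below.
             */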

            /* Get j neighbor index, and coordinate index */
            jnrlistA         = jjnr[jidx];
            jnrlistB         = jjnr[jidx+1];
            jnrlistC         = jjnr[jidx+2];
            jnrlistD         = jjnr[jidx+3];
            jnrlistE         = jjnr[jidx+4];
            jnrlistF         = jjnr[jidx+5];
            jnrlistG         = jjnr[jidx+6];
            jnrlistH         = jjnr[jidx+7];
            /* Sign of each element will be negative for non-real atoms.
             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
             */
            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
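            /* Two 128-bit signed compares (jnr < 0) are merged into one 256-bit mask;
             * the high half comes from jjnr[jidx+4..7] and the low half from
             * jjnr[jidx..jidx+3], matching the lane order of the loads above.
             */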

            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
            j_coord_offsetA  = DIM*jnrA;
            j_coord_offsetB  = DIM*jnrB;
            j_coord_offsetC  = DIM*jnrC;
            j_coord_offsetD  = DIM*jnrD;
            j_coord_offsetE  = DIM*jnrE;
            j_coord_offsetF  = DIM*jnrF;
            j_coord_offsetG  = DIM*jnrG;
            j_coord_offsetH  = DIM*jnrH;

            /* load j atom coordinates */
            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
                                                 x+j_coord_offsetC,x+j_coord_offsetD,
                                                 x+j_coord_offsetE,x+j_coord_offsetF,
                                                 x+j_coord_offsetG,x+j_coord_offsetH,
                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);

            /* Calculate displacement vector */
            dx00             = _mm256_sub_ps(ix0,jx0);
            dy00             = _mm256_sub_ps(iy0,jy0);
            dz00             = _mm256_sub_ps(iz0,jz0);
            dx01             = _mm256_sub_ps(ix0,jx1);
            dy01             = _mm256_sub_ps(iy0,jy1);
            dz01             = _mm256_sub_ps(iz0,jz1);
            dx02             = _mm256_sub_ps(ix0,jx2);
            dy02             = _mm256_sub_ps(iy0,jy2);
            dz02             = _mm256_sub_ps(iz0,jz2);
            dx10             = _mm256_sub_ps(ix1,jx0);
            dy10             = _mm256_sub_ps(iy1,jy0);
            dz10             = _mm256_sub_ps(iz1,jz0);
            dx11             = _mm256_sub_ps(ix1,jx1);
            dy11             = _mm256_sub_ps(iy1,jy1);
            dz11             = _mm256_sub_ps(iz1,jz1);
            dx12             = _mm256_sub_ps(ix1,jx2);
            dy12             = _mm256_sub_ps(iy1,jy2);
            dz12             = _mm256_sub_ps(iz1,jz2);
            dx20             = _mm256_sub_ps(ix2,jx0);
            dy20             = _mm256_sub_ps(iy2,jy0);
            dz20             = _mm256_sub_ps(iz2,jz0);
            dx21             = _mm256_sub_ps(ix2,jx1);
            dy21             = _mm256_sub_ps(iy2,jy1);
            dz21             = _mm256_sub_ps(iz2,jz1);
            dx22             = _mm256_sub_ps(ix2,jx2);
            dy22             = _mm256_sub_ps(iy2,jy2);
            dz22             = _mm256_sub_ps(iz2,jz2);

            /* Calculate squared distance and things based on it */
            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);

            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
            rinv22           = gmx_mm256_invsqrt_ps(rsq22);

            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);

            fjx0             = _mm256_setzero_ps();
            fjy0             = _mm256_setzero_ps();
            fjz0             = _mm256_setzero_ps();
            fjx1             = _mm256_setzero_ps();
            fjy1             = _mm256_setzero_ps();
            fjz1             = _mm256_setzero_ps();
            fjx2             = _mm256_setzero_ps();
            fjy2             = _mm256_setzero_ps();
            fjz2             = _mm256_setzero_ps();

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq00,rcutoff2))
            {

            r00              = _mm256_mul_ps(rsq00,rinv00);
            r00              = _mm256_andnot_ps(dummy_mask,r00);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq00);
            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq00,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv00,pmecorrV);
            velec            = _mm256_mul_ps(qq00,velec);

            /* LENNARD-JONES DISPERSION/REPULSION */

            rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
            vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
            vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
            vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
            fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
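
            /* The one_twelfth/one_sixth factors above suggest the kernel's c6/c12
             * parameters are stored pre-multiplied by 6 and 12 respectively, so that
             * vvdw = C12/r^12 - C6/r^6 while fvdw = (12*C12/r^12 - 6*C6/r^6)/r^2
             * falls out of a single subtraction and multiply.
             */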

            d                = _mm256_sub_ps(r00,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
            fvdw             = _mm256_sub_ps( _mm256_mul_ps(fvdw,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(vvdw,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);

            fscal            = _mm256_add_ps(felec,fvdw);

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx00);
            ty               = _mm256_mul_ps(fscal,dy00);
            tz               = _mm256_mul_ps(fscal,dz00);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq01,rcutoff2))
            {

            r01              = _mm256_mul_ps(rsq01,rinv01);
            r01              = _mm256_andnot_ps(dummy_mask,r01);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq01);
            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq01,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv01,pmecorrV);
            velec            = _mm256_mul_ps(qq01,velec);

            d                = _mm256_sub_ps(r01,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx01);
            ty               = _mm256_mul_ps(fscal,dy01);
            tz               = _mm256_mul_ps(fscal,dz01);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq02,rcutoff2))
            {

            r02              = _mm256_mul_ps(rsq02,rinv02);
            r02              = _mm256_andnot_ps(dummy_mask,r02);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq02);
            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq02,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv02,pmecorrV);
            velec            = _mm256_mul_ps(qq02,velec);

            d                = _mm256_sub_ps(r02,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx02);
            ty               = _mm256_mul_ps(fscal,dy02);
            tz               = _mm256_mul_ps(fscal,dz02);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq10,rcutoff2))
            {

            r10              = _mm256_mul_ps(rsq10,rinv10);
            r10              = _mm256_andnot_ps(dummy_mask,r10);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq10);
            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq10,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv10,pmecorrV);
            velec            = _mm256_mul_ps(qq10,velec);

            d                = _mm256_sub_ps(r10,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx10);
            ty               = _mm256_mul_ps(fscal,dy10);
            tz               = _mm256_mul_ps(fscal,dz10);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq11,rcutoff2))
            {

            r11              = _mm256_mul_ps(rsq11,rinv11);
            r11              = _mm256_andnot_ps(dummy_mask,r11);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq11);
            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq11,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv11,pmecorrV);
            velec            = _mm256_mul_ps(qq11,velec);

            d                = _mm256_sub_ps(r11,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx11);
            ty               = _mm256_mul_ps(fscal,dy11);
            tz               = _mm256_mul_ps(fscal,dz11);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq12,rcutoff2))
            {

            r12              = _mm256_mul_ps(rsq12,rinv12);
            r12              = _mm256_andnot_ps(dummy_mask,r12);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq12);
            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq12,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv12,pmecorrV);
            velec            = _mm256_mul_ps(qq12,velec);

            d                = _mm256_sub_ps(r12,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx12);
            ty               = _mm256_mul_ps(fscal,dy12);
            tz               = _mm256_mul_ps(fscal,dz12);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq20,rcutoff2))
            {

            r20              = _mm256_mul_ps(rsq20,rinv20);
            r20              = _mm256_andnot_ps(dummy_mask,r20);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq20);
            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq20,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv20,pmecorrV);
            velec            = _mm256_mul_ps(qq20,velec);

            d                = _mm256_sub_ps(r20,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx20);
            ty               = _mm256_mul_ps(fscal,dy20);
            tz               = _mm256_mul_ps(fscal,dz20);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq21,rcutoff2))
            {

            r21              = _mm256_mul_ps(rsq21,rinv21);
            r21              = _mm256_andnot_ps(dummy_mask,r21);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq21);
            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq21,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv21,pmecorrV);
            velec            = _mm256_mul_ps(qq21,velec);

            d                = _mm256_sub_ps(r21,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx21);
            ty               = _mm256_mul_ps(fscal,dy21);
            tz               = _mm256_mul_ps(fscal,dz21);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq22,rcutoff2))
            {

            r22              = _mm256_mul_ps(rsq22,rinv22);
            r22              = _mm256_andnot_ps(dummy_mask,r22);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq22);
            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq22,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv22,pmecorrV);
            velec            = _mm256_mul_ps(qq22,velec);

            d                = _mm256_sub_ps(r22,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r - v*dsw/r = fscal*sw - v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx22);
            ty               = _mm256_mul_ps(fscal,dy22);
            tz               = _mm256_mul_ps(fscal,dz22);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;

            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);

            /* Inner loop uses 969 flops */
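            /* The epilogue is slightly costlier than the 960-flop full-width loop,
             * presumably because of the extra dummy-mask operations; masked lanes
             * also scatter their j forces into the local scratch buffer rather
             * than into f[], so the dummy stores are harmless.
             */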
        }

        /* End of innermost loop */

        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
                                                 f+i_coord_offset,fshift+i_shift_offset);

        /* Increment number of inner iterations */
        inneriter                  += j_index_end - j_index_start;

        /* Outer loop uses 18 flops */
    }

    /* Increment number of outer iterations */
    outeriter        += nri;

    /* Update outer/inner flops */

    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*969);
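
    /* A note on the accounting above: the same 969-flop cost is charged for every
     * inner iteration, even though the full-width loop body reports 960 flops, so
     * the nrnb totals are presumably a slight overestimate for full batches.
     */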
}