/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <math.h>

#include "../nb_kernel.h"
#include "types/simple.h"
#include "vec.h"
#include "nrnb.h"

#include "gromacs/simd/math_x86_avx_256_single.h"
#include "kernelutil_x86_avx_256_single.h"

/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_single
 * Electrostatics interaction: Ewald
 * VdW interaction:            None
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        PotentialAndForce
 */
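/*
 * Decoding the kernel name: "ElecEwSw" is Ewald real-space electrostatics with
 * a potential-switch modifier (the switching function set up below scales the
 * potential between rcoulomb_switch and rcoulomb), "VdwNone" means there is no
 * Lennard-Jones term, "GeomW3W3" means both i and j particles are 3-site
 * waters, and "VF" means both potential and force are computed.
 */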
void
nb_kernel_ElecEwSw_VdwNone_GeomW3W3_VF_avx_256_single
                    (t_nblist                    * gmx_restrict       nlist,
                     rvec                        * gmx_restrict          xx,
                     rvec                        * gmx_restrict          ff,
                     t_forcerec                  * gmx_restrict          fr,
                     t_mdatoms                   * gmx_restrict     mdatoms,
                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
                     t_nrnb                      * gmx_restrict        nrnb)
{
    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
     * just 0 for non-waters.
     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
     * jnr indices corresponding to data put in the eight positions in the SIMD register.
     */
    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
    int              jnrA,jnrB,jnrC,jnrD;
    int              jnrE,jnrF,jnrG,jnrH;
    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
    real             scratch[4*DIM];
    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    real *           vdwioffsetptr1;
    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    real *           vdwioffsetptr2;
    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
    __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m256i          ewitab;
    __m128i          ewitab_lo,ewitab_hi;
    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
    real             *ewtab;
    __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
    real             rswitch_scalar,d_scalar;
    __m256           dummy_mask,cutoff_mask;
    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    __m256           one     = _mm256_set1_ps(1.0);
    __m256           two     = _mm256_set1_ps(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_ps(fr->epsfac);
    charge           = mdatoms->chargeA;

    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
    beta2            = _mm256_mul_ps(beta,beta);
    beta3            = _mm256_mul_ps(beta,beta2);

    ewtab            = fr->ic->tabq_coul_FDV0;
    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
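    /* This kernel variant evaluates the analytical PME correction
     * (gmx_mm256_pmecorrF_ps/gmx_mm256_pmecorrV_ps) in its inner loops; the
     * Ewald table parameters above are initialized by the generator regardless
     * of which path a given kernel uses, and are not read below. */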

    /* Setup water-specific parameters */
    inr              = nlist->iinr[0];
    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));

    jq0              = _mm256_set1_ps(charge[inr+0]);
    jq1              = _mm256_set1_ps(charge[inr+1]);
    jq2              = _mm256_set1_ps(charge[inr+2]);
    qq00             = _mm256_mul_ps(iq0,jq0);
    qq01             = _mm256_mul_ps(iq0,jq1);
    qq02             = _mm256_mul_ps(iq0,jq2);
    qq10             = _mm256_mul_ps(iq1,jq0);
    qq11             = _mm256_mul_ps(iq1,jq1);
    qq12             = _mm256_mul_ps(iq1,jq2);
    qq20             = _mm256_mul_ps(iq2,jq0);
    qq21             = _mm256_mul_ps(iq2,jq1);
    qq22             = _mm256_mul_ps(iq2,jq2);

    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
    rcutoff_scalar   = fr->rcoulomb;
    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);

    rswitch_scalar   = fr->rcoulomb_switch;
    rswitch          = _mm256_set1_ps(rswitch_scalar);
    /* Setup switch parameters */
    d_scalar         = rcutoff_scalar-rswitch_scalar;
    d                = _mm256_set1_ps(d_scalar);
    swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
    swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
    swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
    swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
    swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
    swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
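    /* For reference: with D = rcutoff - rswitch and d = max(r - rswitch, 0),
     * the coefficients above implement the C2-continuous quintic switch
     *   sw(d)  = 1 - 10 (d/D)^3 + 15 (d/D)^4 - 6 (d/D)^5
     *   dsw(d) = sw'(d) = -30 d^2/D^3 + 60 d^3/D^4 - 30 d^4/D^5
     * so sw(0)=1, sw(D)=0, and the first two derivatives vanish at both ends.
     * The inner loops evaluate both polynomials in Horner form. */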

    /* Avoid stupid compiler warnings */
    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
    j_coord_offsetA = 0;
    j_coord_offsetB = 0;
    j_coord_offsetC = 0;
    j_coord_offsetD = 0;
    j_coord_offsetE = 0;
    j_coord_offsetF = 0;
    j_coord_offsetG = 0;
    j_coord_offsetH = 0;

    outeriter        = 0;
    inneriter        = 0;

    for(iidx=0;iidx<4*DIM;iidx++)
    {
        scratch[iidx] = 0.0;
    }

    /* Start outer loop over neighborlists */
    for(iidx=0; iidx<nri; iidx++)
    {
        /* Load shift vector for this list */
        i_shift_offset   = DIM*shiftidx[iidx];

        /* Load limits for loop over neighbors */
        j_index_start    = jindex[iidx];
        j_index_end      = jindex[iidx+1];

        /* Get outer coordinate index */
        inr              = iinr[iidx];
        i_coord_offset   = DIM*inr;

        /* Load i particle coords and add shift vector */
        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);

        fix0             = _mm256_setzero_ps();
        fiy0             = _mm256_setzero_ps();
        fiz0             = _mm256_setzero_ps();
        fix1             = _mm256_setzero_ps();
        fiy1             = _mm256_setzero_ps();
        fiz1             = _mm256_setzero_ps();
        fix2             = _mm256_setzero_ps();
        fiy2             = _mm256_setzero_ps();
        fiz2             = _mm256_setzero_ps();

        /* Reset potential sums */
        velecsum         = _mm256_setzero_ps();

        /* Start inner kernel loop */
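        /* Each iteration handles eight j waters; the loop exits as soon as the
         * eight-wide chunk contains tail padding (negative jnr), which is then
         * processed by the masked epilogue further down. */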
        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
        {

            /* Get j neighbor index, and coordinate index */
            jnrA             = jjnr[jidx];
            jnrB             = jjnr[jidx+1];
            jnrC             = jjnr[jidx+2];
            jnrD             = jjnr[jidx+3];
            jnrE             = jjnr[jidx+4];
            jnrF             = jjnr[jidx+5];
            jnrG             = jjnr[jidx+6];
            jnrH             = jjnr[jidx+7];
            j_coord_offsetA  = DIM*jnrA;
            j_coord_offsetB  = DIM*jnrB;
            j_coord_offsetC  = DIM*jnrC;
            j_coord_offsetD  = DIM*jnrD;
            j_coord_offsetE  = DIM*jnrE;
            j_coord_offsetF  = DIM*jnrF;
            j_coord_offsetG  = DIM*jnrG;
            j_coord_offsetH  = DIM*jnrH;

            /* load j atom coordinates */
            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
                                                 x+j_coord_offsetC,x+j_coord_offsetD,
                                                 x+j_coord_offsetE,x+j_coord_offsetF,
                                                 x+j_coord_offsetG,x+j_coord_offsetH,
                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);

            /* Calculate displacement vector */
            dx00             = _mm256_sub_ps(ix0,jx0);
            dy00             = _mm256_sub_ps(iy0,jy0);
            dz00             = _mm256_sub_ps(iz0,jz0);
            dx01             = _mm256_sub_ps(ix0,jx1);
            dy01             = _mm256_sub_ps(iy0,jy1);
            dz01             = _mm256_sub_ps(iz0,jz1);
            dx02             = _mm256_sub_ps(ix0,jx2);
            dy02             = _mm256_sub_ps(iy0,jy2);
            dz02             = _mm256_sub_ps(iz0,jz2);
            dx10             = _mm256_sub_ps(ix1,jx0);
            dy10             = _mm256_sub_ps(iy1,jy0);
            dz10             = _mm256_sub_ps(iz1,jz0);
            dx11             = _mm256_sub_ps(ix1,jx1);
            dy11             = _mm256_sub_ps(iy1,jy1);
            dz11             = _mm256_sub_ps(iz1,jz1);
            dx12             = _mm256_sub_ps(ix1,jx2);
            dy12             = _mm256_sub_ps(iy1,jy2);
            dz12             = _mm256_sub_ps(iz1,jz2);
            dx20             = _mm256_sub_ps(ix2,jx0);
            dy20             = _mm256_sub_ps(iy2,jy0);
            dz20             = _mm256_sub_ps(iz2,jz0);
            dx21             = _mm256_sub_ps(ix2,jx1);
            dy21             = _mm256_sub_ps(iy2,jy1);
            dz21             = _mm256_sub_ps(iz2,jz1);
            dx22             = _mm256_sub_ps(ix2,jx2);
            dy22             = _mm256_sub_ps(iy2,jy2);
            dz22             = _mm256_sub_ps(iz2,jz2);

            /* Calculate squared distance and things based on it */
            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);

            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
            rinv22           = gmx_mm256_invsqrt_ps(rsq22);

            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);

            fjx0             = _mm256_setzero_ps();
            fjy0             = _mm256_setzero_ps();
            fjz0             = _mm256_setzero_ps();
            fjx1             = _mm256_setzero_ps();
            fjy1             = _mm256_setzero_ps();
            fjz1             = _mm256_setzero_ps();
            fjx2             = _mm256_setzero_ps();
            fjy2             = _mm256_setzero_ps();
            fjz2             = _mm256_setzero_ps();

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq00,rcutoff2))
            {

            r00              = _mm256_mul_ps(rsq00,rinv00);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq00);
            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq00,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv00,pmecorrV);
            velec            = _mm256_mul_ps(qq00,velec);
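            /* Up to the accuracy of the pmecorr helpers, pmecorrV(zeta2)*beta
             * approximates erf(beta*r)/r, so velec = qq00*erfc(beta*r00)/r00 is
             * the real-space Ewald potential, and felec is the matching scalar
             * force divided by r. The same pattern repeats for each pair below. */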

            d                = _mm256_sub_ps(r00,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
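            /* That is, with v the unswitched potential and f = -dv/dr, the next
             * two statements form fscal = f*sw/r - v*dsw/r and v*sw. */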
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx00);
            ty               = _mm256_mul_ps(fscal,dy00);
            tz               = _mm256_mul_ps(fscal,dz00);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq01,rcutoff2))
            {

            r01              = _mm256_mul_ps(rsq01,rinv01);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq01);
            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq01,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv01,pmecorrV);
            velec            = _mm256_mul_ps(qq01,velec);

            d                = _mm256_sub_ps(r01,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx01);
            ty               = _mm256_mul_ps(fscal,dy01);
            tz               = _mm256_mul_ps(fscal,dz01);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq02,rcutoff2))
            {

            r02              = _mm256_mul_ps(rsq02,rinv02);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq02);
            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq02,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv02,pmecorrV);
            velec            = _mm256_mul_ps(qq02,velec);

            d                = _mm256_sub_ps(r02,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx02);
            ty               = _mm256_mul_ps(fscal,dy02);
            tz               = _mm256_mul_ps(fscal,dz02);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq10,rcutoff2))
            {

            r10              = _mm256_mul_ps(rsq10,rinv10);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq10);
            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq10,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv10,pmecorrV);
            velec            = _mm256_mul_ps(qq10,velec);

            d                = _mm256_sub_ps(r10,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx10);
            ty               = _mm256_mul_ps(fscal,dy10);
            tz               = _mm256_mul_ps(fscal,dz10);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq11,rcutoff2))
            {

            r11              = _mm256_mul_ps(rsq11,rinv11);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq11);
            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq11,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv11,pmecorrV);
            velec            = _mm256_mul_ps(qq11,velec);

            d                = _mm256_sub_ps(r11,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx11);
            ty               = _mm256_mul_ps(fscal,dy11);
            tz               = _mm256_mul_ps(fscal,dz11);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq12,rcutoff2))
            {

            r12              = _mm256_mul_ps(rsq12,rinv12);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq12);
            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq12,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv12,pmecorrV);
            velec            = _mm256_mul_ps(qq12,velec);

            d                = _mm256_sub_ps(r12,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx12);
            ty               = _mm256_mul_ps(fscal,dy12);
            tz               = _mm256_mul_ps(fscal,dz12);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq20,rcutoff2))
            {

            r20              = _mm256_mul_ps(rsq20,rinv20);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq20);
            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq20,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv20,pmecorrV);
            velec            = _mm256_mul_ps(qq20,velec);

            d                = _mm256_sub_ps(r20,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx20);
            ty               = _mm256_mul_ps(fscal,dy20);
            tz               = _mm256_mul_ps(fscal,dz20);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq21,rcutoff2))
            {

            r21              = _mm256_mul_ps(rsq21,rinv21);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq21);
            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq21,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv21,pmecorrV);
            velec            = _mm256_mul_ps(qq21,velec);

            d                = _mm256_sub_ps(r21,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx21);
            ty               = _mm256_mul_ps(fscal,dy21);
            tz               = _mm256_mul_ps(fscal,dz21);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq22,rcutoff2))
            {

            r22              = _mm256_mul_ps(rsq22,rinv22);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq22);
            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq22,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv22,pmecorrV);
            velec            = _mm256_mul_ps(qq22,velec);

            d                = _mm256_sub_ps(r22,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx22);
            ty               = _mm256_mul_ps(fscal,dy22);
            tz               = _mm256_mul_ps(fscal,dz22);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            fjptrA             = f+j_coord_offsetA;
            fjptrB             = f+j_coord_offsetB;
            fjptrC             = f+j_coord_offsetC;
            fjptrD             = f+j_coord_offsetD;
            fjptrE             = f+j_coord_offsetE;
            fjptrF             = f+j_coord_offsetF;
            fjptrG             = f+j_coord_offsetG;
            fjptrH             = f+j_coord_offsetH;

            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);

            /* Inner loop uses 972 flops */
        }

        if(jidx<j_index_end)
        {

            /* Get j neighbor index, and coordinate index */
            jnrlistA         = jjnr[jidx];
            jnrlistB         = jjnr[jidx+1];
            jnrlistC         = jjnr[jidx+2];
            jnrlistD         = jjnr[jidx+3];
            jnrlistE         = jjnr[jidx+4];
            jnrlistF         = jjnr[jidx+5];
            jnrlistG         = jjnr[jidx+6];
            jnrlistH         = jjnr[jidx+7];
            /* Sign of each element will be negative for non-real atoms.
             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
             */
            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));

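            /* gmx_mm256_set_m128(hi,lo) concatenates the two 128-bit compare
             * results, so lanes A-D of dummy_mask come from jjnr[jidx..jidx+3]
             * and lanes E-H from jjnr[jidx+4..jidx+7]. */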
895             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
896             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
897             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
898             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
899             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
900             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
901             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
902             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
903             j_coord_offsetA  = DIM*jnrA;
904             j_coord_offsetB  = DIM*jnrB;
905             j_coord_offsetC  = DIM*jnrC;
906             j_coord_offsetD  = DIM*jnrD;
907             j_coord_offsetE  = DIM*jnrE;
908             j_coord_offsetF  = DIM*jnrF;
909             j_coord_offsetG  = DIM*jnrG;
910             j_coord_offsetH  = DIM*jnrH;
911
912             /* load j atom coordinates */
913             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
914                                                  x+j_coord_offsetC,x+j_coord_offsetD,
915                                                  x+j_coord_offsetE,x+j_coord_offsetF,
916                                                  x+j_coord_offsetG,x+j_coord_offsetH,
917                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
918
919             /* Calculate displacement vector */
920             dx00             = _mm256_sub_ps(ix0,jx0);
921             dy00             = _mm256_sub_ps(iy0,jy0);
922             dz00             = _mm256_sub_ps(iz0,jz0);
923             dx01             = _mm256_sub_ps(ix0,jx1);
924             dy01             = _mm256_sub_ps(iy0,jy1);
925             dz01             = _mm256_sub_ps(iz0,jz1);
926             dx02             = _mm256_sub_ps(ix0,jx2);
927             dy02             = _mm256_sub_ps(iy0,jy2);
928             dz02             = _mm256_sub_ps(iz0,jz2);
929             dx10             = _mm256_sub_ps(ix1,jx0);
930             dy10             = _mm256_sub_ps(iy1,jy0);
931             dz10             = _mm256_sub_ps(iz1,jz0);
932             dx11             = _mm256_sub_ps(ix1,jx1);
933             dy11             = _mm256_sub_ps(iy1,jy1);
934             dz11             = _mm256_sub_ps(iz1,jz1);
935             dx12             = _mm256_sub_ps(ix1,jx2);
936             dy12             = _mm256_sub_ps(iy1,jy2);
937             dz12             = _mm256_sub_ps(iz1,jz2);
938             dx20             = _mm256_sub_ps(ix2,jx0);
939             dy20             = _mm256_sub_ps(iy2,jy0);
940             dz20             = _mm256_sub_ps(iz2,jz0);
941             dx21             = _mm256_sub_ps(ix2,jx1);
942             dy21             = _mm256_sub_ps(iy2,jy1);
943             dz21             = _mm256_sub_ps(iz2,jz1);
944             dx22             = _mm256_sub_ps(ix2,jx2);
945             dy22             = _mm256_sub_ps(iy2,jy2);
946             dz22             = _mm256_sub_ps(iz2,jz2);
947
948             /* Calculate squared distance and things based on it */
949             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
950             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
951             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
952             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
953             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
954             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
955             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
956             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
957             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
958
959             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
960             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
961             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
962             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
963             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
964             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
965             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
966             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
967             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
968
969             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
970             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
971             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
972             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
973             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
974             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
975             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
976             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
977             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
978
979             fjx0             = _mm256_setzero_ps();
980             fjy0             = _mm256_setzero_ps();
981             fjz0             = _mm256_setzero_ps();
982             fjx1             = _mm256_setzero_ps();
983             fjy1             = _mm256_setzero_ps();
984             fjz1             = _mm256_setzero_ps();
985             fjx2             = _mm256_setzero_ps();
986             fjy2             = _mm256_setzero_ps();
987             fjz2             = _mm256_setzero_ps();
988
989             /**************************
990              * CALCULATE INTERACTIONS *
991              **************************/
992
993             if (gmx_mm256_any_lt(rsq00,rcutoff2))
994             {
995
996             r00              = _mm256_mul_ps(rsq00,rinv00);
997             r00              = _mm256_andnot_ps(dummy_mask,r00);
998
999             /* EWALD ELECTROSTATICS */
1000             
1001             /* Analytical PME correction */
1002             zeta2            = _mm256_mul_ps(beta2,rsq00);
1003             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1004             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1005             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1006             felec            = _mm256_mul_ps(qq00,felec);
1007             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1008             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1009             velec            = _mm256_sub_ps(rinv00,pmecorrV);
1010             velec            = _mm256_mul_ps(qq00,velec);
1011             
1012             d                = _mm256_sub_ps(r00,rswitch);
1013             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1014             d2               = _mm256_mul_ps(d,d);
1015             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1016
1017             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1018
1019             /* Evaluate switch function */
1020             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1021             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1022             velec            = _mm256_mul_ps(velec,sw);
1023             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1024
1025             /* Update potential sum for this i atom from the interaction with this j atom. */
1026             velec            = _mm256_and_ps(velec,cutoff_mask);
1027             velec            = _mm256_andnot_ps(dummy_mask,velec);
1028             velecsum         = _mm256_add_ps(velecsum,velec);
1029
1030             fscal            = felec;
1031
1032             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1033
1034             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1035
1036             /* Calculate temporary vectorial force */
1037             tx               = _mm256_mul_ps(fscal,dx00);
1038             ty               = _mm256_mul_ps(fscal,dy00);
1039             tz               = _mm256_mul_ps(fscal,dz00);
1040
1041             /* Update vectorial force */
1042             fix0             = _mm256_add_ps(fix0,tx);
1043             fiy0             = _mm256_add_ps(fiy0,ty);
1044             fiz0             = _mm256_add_ps(fiz0,tz);
1045
1046             fjx0             = _mm256_add_ps(fjx0,tx);
1047             fjy0             = _mm256_add_ps(fjy0,ty);
1048             fjz0             = _mm256_add_ps(fjz0,tz);
1049
1050             }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq01,rcutoff2))
            {

            r01              = _mm256_mul_ps(rsq01,rinv01);
            r01              = _mm256_andnot_ps(dummy_mask,r01);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq01);
            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq01,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv01,pmecorrV);
            velec            = _mm256_mul_ps(qq01,velec);

            d                = _mm256_sub_ps(r01,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx01);
            ty               = _mm256_mul_ps(fscal,dy01);
            tz               = _mm256_mul_ps(fscal,dz01);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq02,rcutoff2))
            {

            r02              = _mm256_mul_ps(rsq02,rinv02);
            r02              = _mm256_andnot_ps(dummy_mask,r02);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq02);
            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq02,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv02,pmecorrV);
            velec            = _mm256_mul_ps(qq02,velec);

            d                = _mm256_sub_ps(r02,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx02);
            ty               = _mm256_mul_ps(fscal,dy02);
            tz               = _mm256_mul_ps(fscal,dz02);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq10,rcutoff2))
            {

            r10              = _mm256_mul_ps(rsq10,rinv10);
            r10              = _mm256_andnot_ps(dummy_mask,r10);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq10);
            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq10,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv10,pmecorrV);
            velec            = _mm256_mul_ps(qq10,velec);

            d                = _mm256_sub_ps(r10,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx10);
            ty               = _mm256_mul_ps(fscal,dy10);
            tz               = _mm256_mul_ps(fscal,dz10);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq11,rcutoff2))
            {

            r11              = _mm256_mul_ps(rsq11,rinv11);
            r11              = _mm256_andnot_ps(dummy_mask,r11);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq11);
            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq11,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv11,pmecorrV);
            velec            = _mm256_mul_ps(qq11,velec);

            d                = _mm256_sub_ps(r11,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx11);
            ty               = _mm256_mul_ps(fscal,dy11);
            tz               = _mm256_mul_ps(fscal,dz11);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq12,rcutoff2))
            {

            r12              = _mm256_mul_ps(rsq12,rinv12);
            r12              = _mm256_andnot_ps(dummy_mask,r12);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq12);
            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq12,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv12,pmecorrV);
            velec            = _mm256_mul_ps(qq12,velec);

            d                = _mm256_sub_ps(r12,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx12);
            ty               = _mm256_mul_ps(fscal,dy12);
            tz               = _mm256_mul_ps(fscal,dz12);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq20,rcutoff2))
            {

            r20              = _mm256_mul_ps(rsq20,rinv20);
            r20              = _mm256_andnot_ps(dummy_mask,r20);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq20);
            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq20,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv20,pmecorrV);
            velec            = _mm256_mul_ps(qq20,velec);

            d                = _mm256_sub_ps(r20,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx20);
            ty               = _mm256_mul_ps(fscal,dy20);
            tz               = _mm256_mul_ps(fscal,dz20);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq21,rcutoff2))
            {

            r21              = _mm256_mul_ps(rsq21,rinv21);
            r21              = _mm256_andnot_ps(dummy_mask,r21);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq21);
            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq21,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv21,pmecorrV);
            velec            = _mm256_mul_ps(qq21,velec);

            d                = _mm256_sub_ps(r21,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx21);
            ty               = _mm256_mul_ps(fscal,dy21);
            tz               = _mm256_mul_ps(fscal,dz21);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq22,rcutoff2))
            {

            r22              = _mm256_mul_ps(rsq22,rinv22);
            r22              = _mm256_andnot_ps(dummy_mask,r22);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq22);
            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq22,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv22,pmecorrV);
            velec            = _mm256_mul_ps(qq22,velec);

            d                = _mm256_sub_ps(r22,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx22);
            ty               = _mm256_mul_ps(fscal,dy22);
            tz               = _mm256_mul_ps(fscal,dz22);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

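            /* Scatter the accumulated j forces back to memory. Lanes whose
             * neighbor index is negative are padding, so their forces are
             * redirected into the local scratch buffer rather than the real
             * force array. */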
            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;

            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);

            /* Inner loop uses 981 flops */
        }

        /* End of innermost loop */

        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
                                                 f+i_coord_offset,fshift+i_shift_offset);

        ggid                        = gid[iidx];
        /* Update potential energies */
        gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);

        /* Increment number of inner iterations */
        inneriter                  += j_index_end - j_index_start;

        /* Outer loop uses 19 flops */
    }

    /* Increment number of outer iterations */
    outeriter        += nri;

    /* Update outer/inner flops */

    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*981);
}
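
/*
 * For reference, a minimal scalar sketch of the switched Ewald pair term that
 * the vectorized blocks above implement. It is illustrative only: the helper
 * name ref_elec_ewsw and its signature are not part of the generated kernel
 * API, and it calls erfc() from math.h directly instead of the polynomial
 * PME-correction approximations used by the kernels.
 */
static double
ref_elec_ewsw(double qq, double r, double beta,
              double rswitch, double rcutoff, double *fscal)
{
    const double two_over_sqrtpi = 1.1283791670955126;
    double       p, d, sw, dsw, v, f_over_r;

    if (r >= rcutoff)
    {
        *fscal = 0.0;
        return 0.0;
    }
    /* Ewald real-space potential and |F|/r for one charge pair */
    v        = qq*erfc(beta*r)/r;
    f_over_r = qq*(erfc(beta*r)/(r*r*r)
                   + two_over_sqrtpi*beta*exp(-beta*beta*r*r)/(r*r));
    /* Quintic switch S = 1 - 10t^3 + 15t^4 - 6t^5, t = d/p, as set up below */
    p        = rcutoff - rswitch;
    d        = (r > rswitch) ? r - rswitch : 0.0;
    sw       = 1.0 + d*d*d*(-10.0/(p*p*p) + d*(15.0/(p*p*p*p) - d*6.0/(p*p*p*p*p)));
    dsw      = d*d*(-30.0/(p*p*p) + d*(60.0/(p*p*p*p) - d*30.0/(p*p*p*p*p)));
    /* fscal' = fscal*sw - v*dsw/r, matching the kernel comments above */
    *fscal   = f_over_r*sw - v*dsw/r;
    return v*sw;
}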
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_single
 * Electrostatics interaction: Ewald
 * VdW interaction:            None
 * Geometry:                   Water3-Water3
 * Calculate force/pot:        Force
 */
void
nb_kernel_ElecEwSw_VdwNone_GeomW3W3_F_avx_256_single
                    (t_nblist                    * gmx_restrict       nlist,
                     rvec                        * gmx_restrict          xx,
                     rvec                        * gmx_restrict          ff,
                     t_forcerec                  * gmx_restrict          fr,
                     t_mdatoms                   * gmx_restrict     mdatoms,
                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
                     t_nrnb                      * gmx_restrict        nrnb)
{
    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
     * just 0 for non-waters.
     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
     * jnr indices corresponding to data put in the eight positions in the SIMD register.
     */
    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
    int              jnrA,jnrB,jnrC,jnrD;
    int              jnrE,jnrF,jnrG,jnrH;
    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
    real             scratch[4*DIM];
    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    real *           vdwioffsetptr1;
    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    real *           vdwioffsetptr2;
    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
    __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
    __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
    __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m256i          ewitab;
    __m128i          ewitab_lo,ewitab_hi;
    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
    real             *ewtab;
    __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
    real             rswitch_scalar,d_scalar;
    __m256           dummy_mask,cutoff_mask;
    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    __m256           one     = _mm256_set1_ps(1.0);
    __m256           two     = _mm256_set1_ps(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_ps(fr->epsfac);
    charge           = mdatoms->chargeA;

    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
    beta2            = _mm256_mul_ps(beta,beta);
    beta3            = _mm256_mul_ps(beta,beta2);

    ewtab            = fr->ic->tabq_coul_FDV0;
    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);

    /* Setup water-specific parameters */
    inr              = nlist->iinr[0];
    iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));

    jq0              = _mm256_set1_ps(charge[inr+0]);
    jq1              = _mm256_set1_ps(charge[inr+1]);
    jq2              = _mm256_set1_ps(charge[inr+2]);
    qq00             = _mm256_mul_ps(iq0,jq0);
    qq01             = _mm256_mul_ps(iq0,jq1);
    qq02             = _mm256_mul_ps(iq0,jq2);
    qq10             = _mm256_mul_ps(iq1,jq0);
    qq11             = _mm256_mul_ps(iq1,jq1);
    qq12             = _mm256_mul_ps(iq1,jq2);
    qq20             = _mm256_mul_ps(iq2,jq0);
    qq21             = _mm256_mul_ps(iq2,jq1);
    qq22             = _mm256_mul_ps(iq2,jq2);
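
    /* For rigid 3-site waters every molecule carries the same three charges,
     * so all nine i-j charge products can be precomputed once here instead of
     * being reloaded in the inner loop.
     */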

    /* When explicit cutoffs are used the value must be identical for electrostatics and VdW, so we use the electrostatics cutoff; the choice is arbitrary */
    rcutoff_scalar   = fr->rcoulomb;
    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);

    rswitch_scalar   = fr->rcoulomb_switch;
    rswitch          = _mm256_set1_ps(rswitch_scalar);
    /* Setup switch parameters */
    d_scalar         = rcutoff_scalar-rswitch_scalar;
    d                = _mm256_set1_ps(d_scalar);
    swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
    swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
    swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
    swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
    swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
    swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
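
    /* These are the coefficients of the quintic potential-switch polynomial
     * S(t) = 1 - 10*t^3 + 15*t^4 - 6*t^5, t = (r-rswitch)/(rcutoff-rswitch),
     * written in powers of d = r-rswitch: swV* reproduce S and swF* its
     * derivative dS/dd, so the switch goes smoothly from 1 at rswitch to 0
     * at rcutoff with zero slope at both ends.
     */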

    /* Initialize to avoid spurious "may be used uninitialized" compiler warnings */
    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
    j_coord_offsetA = 0;
    j_coord_offsetB = 0;
    j_coord_offsetC = 0;
    j_coord_offsetD = 0;
    j_coord_offsetE = 0;
    j_coord_offsetF = 0;
    j_coord_offsetG = 0;
    j_coord_offsetH = 0;

    outeriter        = 0;
    inneriter        = 0;

    for(iidx=0;iidx<4*DIM;iidx++)
    {
        scratch[iidx] = 0.0;
    }

    /* Start outer loop over neighborlists */
    for(iidx=0; iidx<nri; iidx++)
    {
        /* Load shift vector for this list */
        i_shift_offset   = DIM*shiftidx[iidx];

        /* Load limits for loop over neighbors */
        j_index_start    = jindex[iidx];
        j_index_end      = jindex[iidx+1];

        /* Get outer coordinate index */
        inr              = iinr[iidx];
        i_coord_offset   = DIM*inr;

        /* Load i particle coords and add shift vector */
        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
                                                    &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);

        fix0             = _mm256_setzero_ps();
        fiy0             = _mm256_setzero_ps();
        fiz0             = _mm256_setzero_ps();
        fix1             = _mm256_setzero_ps();
        fiy1             = _mm256_setzero_ps();
        fiz1             = _mm256_setzero_ps();
        fix2             = _mm256_setzero_ps();
        fiy2             = _mm256_setzero_ps();
        fiz2             = _mm256_setzero_ps();

        /* Start inner kernel loop */
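        /* This loop body handles full eight-wide iterations only; it runs
         * while all eight j indices are valid (jjnr[jidx+7]>=0). Any leftover,
         * partially filled iteration is handled separately with dummy_mask,
         * as in the epilogue of the kernel above.
         */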
        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
        {

            /* Get j neighbor index, and coordinate index */
            jnrA             = jjnr[jidx];
            jnrB             = jjnr[jidx+1];
            jnrC             = jjnr[jidx+2];
            jnrD             = jjnr[jidx+3];
            jnrE             = jjnr[jidx+4];
            jnrF             = jjnr[jidx+5];
            jnrG             = jjnr[jidx+6];
            jnrH             = jjnr[jidx+7];
            j_coord_offsetA  = DIM*jnrA;
            j_coord_offsetB  = DIM*jnrB;
            j_coord_offsetC  = DIM*jnrC;
            j_coord_offsetD  = DIM*jnrD;
            j_coord_offsetE  = DIM*jnrE;
            j_coord_offsetF  = DIM*jnrF;
            j_coord_offsetG  = DIM*jnrG;
            j_coord_offsetH  = DIM*jnrH;

            /* load j atom coordinates */
            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
                                                 x+j_coord_offsetC,x+j_coord_offsetD,
                                                 x+j_coord_offsetE,x+j_coord_offsetF,
                                                 x+j_coord_offsetG,x+j_coord_offsetH,
                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);

            /* Calculate displacement vector */
            dx00             = _mm256_sub_ps(ix0,jx0);
            dy00             = _mm256_sub_ps(iy0,jy0);
            dz00             = _mm256_sub_ps(iz0,jz0);
            dx01             = _mm256_sub_ps(ix0,jx1);
            dy01             = _mm256_sub_ps(iy0,jy1);
            dz01             = _mm256_sub_ps(iz0,jz1);
            dx02             = _mm256_sub_ps(ix0,jx2);
            dy02             = _mm256_sub_ps(iy0,jy2);
            dz02             = _mm256_sub_ps(iz0,jz2);
            dx10             = _mm256_sub_ps(ix1,jx0);
            dy10             = _mm256_sub_ps(iy1,jy0);
            dz10             = _mm256_sub_ps(iz1,jz0);
            dx11             = _mm256_sub_ps(ix1,jx1);
            dy11             = _mm256_sub_ps(iy1,jy1);
            dz11             = _mm256_sub_ps(iz1,jz1);
            dx12             = _mm256_sub_ps(ix1,jx2);
            dy12             = _mm256_sub_ps(iy1,jy2);
            dz12             = _mm256_sub_ps(iz1,jz2);
            dx20             = _mm256_sub_ps(ix2,jx0);
            dy20             = _mm256_sub_ps(iy2,jy0);
            dz20             = _mm256_sub_ps(iz2,jz0);
            dx21             = _mm256_sub_ps(ix2,jx1);
            dy21             = _mm256_sub_ps(iy2,jy1);
            dz21             = _mm256_sub_ps(iz2,jz1);
            dx22             = _mm256_sub_ps(ix2,jx2);
            dy22             = _mm256_sub_ps(iy2,jy2);
            dz22             = _mm256_sub_ps(iz2,jz2);

            /* Calculate squared distance and things based on it */
            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);

            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
            rinv22           = gmx_mm256_invsqrt_ps(rsq22);
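            /* gmx_mm256_invsqrt_ps returns 1/sqrt(rsq) for all eight lanes;
             * in single precision this is commonly the hardware rsqrt estimate
             * refined by one Newton-Raphson step (an assumption about the
             * helper, which is defined elsewhere, not something shown here).
             */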
1847
1848             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1849             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1850             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1851             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1852             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1853             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1854             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1855             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1856             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1857
1858             fjx0             = _mm256_setzero_ps();
1859             fjy0             = _mm256_setzero_ps();
1860             fjz0             = _mm256_setzero_ps();
1861             fjx1             = _mm256_setzero_ps();
1862             fjy1             = _mm256_setzero_ps();
1863             fjz1             = _mm256_setzero_ps();
1864             fjx2             = _mm256_setzero_ps();
1865             fjy2             = _mm256_setzero_ps();
1866             fjz2             = _mm256_setzero_ps();
1867
1868             /**************************
1869              * CALCULATE INTERACTIONS *
1870              **************************/
1871
1872             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1873             {
1874
1875             r00              = _mm256_mul_ps(rsq00,rinv00);
1876
1877             /* EWALD ELECTROSTATICS */
1878             
1879             /* Analytical PME correction */
1880             zeta2            = _mm256_mul_ps(beta2,rsq00);
1881             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1882             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1883             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1884             felec            = _mm256_mul_ps(qq00,felec);
1885             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1886             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1887             velec            = _mm256_sub_ps(rinv00,pmecorrV);
1888             velec            = _mm256_mul_ps(qq00,velec);
1889             
1890             d                = _mm256_sub_ps(r00,rswitch);
1891             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1892             d2               = _mm256_mul_ps(d,d);
1893             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1894
1895             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1896
1897             /* Evaluate switch function */
1898             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1899             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
1900             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1901
1902             fscal            = felec;
1903
1904             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1905
1906             /* Calculate temporary vectorial force */
1907             tx               = _mm256_mul_ps(fscal,dx00);
1908             ty               = _mm256_mul_ps(fscal,dy00);
1909             tz               = _mm256_mul_ps(fscal,dz00);
1910
1911             /* Update vectorial force */
1912             fix0             = _mm256_add_ps(fix0,tx);
1913             fiy0             = _mm256_add_ps(fiy0,ty);
1914             fiz0             = _mm256_add_ps(fiz0,tz);
1915
1916             fjx0             = _mm256_add_ps(fjx0,tx);
1917             fjy0             = _mm256_add_ps(fjy0,ty);
1918             fjz0             = _mm256_add_ps(fjz0,tz);
1919
1920             }
1921
1922             /**************************
1923              * CALCULATE INTERACTIONS *
1924              **************************/
1925
1926             if (gmx_mm256_any_lt(rsq01,rcutoff2))
1927             {
1928
1929             r01              = _mm256_mul_ps(rsq01,rinv01);
1930
1931             /* EWALD ELECTROSTATICS */
1932             
1933             /* Analytical PME correction */
1934             zeta2            = _mm256_mul_ps(beta2,rsq01);
1935             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1936             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1937             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1938             felec            = _mm256_mul_ps(qq01,felec);
1939             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1940             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1941             velec            = _mm256_sub_ps(rinv01,pmecorrV);
1942             velec            = _mm256_mul_ps(qq01,velec);
1943             
1944             d                = _mm256_sub_ps(r01,rswitch);
1945             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1946             d2               = _mm256_mul_ps(d,d);
1947             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1948
1949             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1950
1951             /* Evaluate switch function */
1952             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1953             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
1954             cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1955
1956             fscal            = felec;
1957
1958             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1959
1960             /* Calculate temporary vectorial force */
1961             tx               = _mm256_mul_ps(fscal,dx01);
1962             ty               = _mm256_mul_ps(fscal,dy01);
1963             tz               = _mm256_mul_ps(fscal,dz01);
1964
1965             /* Update vectorial force */
1966             fix0             = _mm256_add_ps(fix0,tx);
1967             fiy0             = _mm256_add_ps(fiy0,ty);
1968             fiz0             = _mm256_add_ps(fiz0,tz);
1969
1970             fjx1             = _mm256_add_ps(fjx1,tx);
1971             fjy1             = _mm256_add_ps(fjy1,ty);
1972             fjz1             = _mm256_add_ps(fjz1,tz);
1973
1974             }
1975
1976             /**************************
1977              * CALCULATE INTERACTIONS *
1978              **************************/
1979
1980             if (gmx_mm256_any_lt(rsq02,rcutoff2))
1981             {
1982
1983             r02              = _mm256_mul_ps(rsq02,rinv02);
1984
1985             /* EWALD ELECTROSTATICS */
1986             
1987             /* Analytical PME correction */
1988             zeta2            = _mm256_mul_ps(beta2,rsq02);
1989             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1990             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1991             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1992             felec            = _mm256_mul_ps(qq02,felec);
1993             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1994             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1995             velec            = _mm256_sub_ps(rinv02,pmecorrV);
1996             velec            = _mm256_mul_ps(qq02,velec);
1997             
1998             d                = _mm256_sub_ps(r02,rswitch);
1999             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2000             d2               = _mm256_mul_ps(d,d);
2001             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2002
2003             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2004
2005             /* Evaluate switch function */
2006             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2007             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
2008             cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2009
2010             fscal            = felec;
2011
2012             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2013
2014             /* Calculate temporary vectorial force */
2015             tx               = _mm256_mul_ps(fscal,dx02);
2016             ty               = _mm256_mul_ps(fscal,dy02);
2017             tz               = _mm256_mul_ps(fscal,dz02);
2018
2019             /* Update vectorial force */
2020             fix0             = _mm256_add_ps(fix0,tx);
2021             fiy0             = _mm256_add_ps(fiy0,ty);
2022             fiz0             = _mm256_add_ps(fiz0,tz);
2023
2024             fjx2             = _mm256_add_ps(fjx2,tx);
2025             fjy2             = _mm256_add_ps(fjy2,ty);
2026             fjz2             = _mm256_add_ps(fjz2,tz);
2027
2028             }
2029
2030             /**************************
2031              * CALCULATE INTERACTIONS *
2032              **************************/
2033
2034             if (gmx_mm256_any_lt(rsq10,rcutoff2))
2035             {
2036
2037             r10              = _mm256_mul_ps(rsq10,rinv10);
2038
2039             /* EWALD ELECTROSTATICS */
2040             
2041             /* Analytical PME correction */
2042             zeta2            = _mm256_mul_ps(beta2,rsq10);
2043             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
2044             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2045             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2046             felec            = _mm256_mul_ps(qq10,felec);
2047             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2048             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2049             velec            = _mm256_sub_ps(rinv10,pmecorrV);
2050             velec            = _mm256_mul_ps(qq10,velec);
2051             
2052             d                = _mm256_sub_ps(r10,rswitch);
2053             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2054             d2               = _mm256_mul_ps(d,d);
2055             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2056
2057             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2058
2059             /* Evaluate switch function */
2060             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2061             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
2062             cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2063
2064             fscal            = felec;
2065
2066             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2067
2068             /* Calculate temporary vectorial force */
2069             tx               = _mm256_mul_ps(fscal,dx10);
2070             ty               = _mm256_mul_ps(fscal,dy10);
2071             tz               = _mm256_mul_ps(fscal,dz10);
2072
2073             /* Update vectorial force */
2074             fix1             = _mm256_add_ps(fix1,tx);
2075             fiy1             = _mm256_add_ps(fiy1,ty);
2076             fiz1             = _mm256_add_ps(fiz1,tz);
2077
2078             fjx0             = _mm256_add_ps(fjx0,tx);
2079             fjy0             = _mm256_add_ps(fjy0,ty);
2080             fjz0             = _mm256_add_ps(fjz0,tz);
2081
2082             }
2083
2084             /**************************
2085              * CALCULATE INTERACTIONS *
2086              **************************/
2087
2088             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2089             {
2090
2091             r11              = _mm256_mul_ps(rsq11,rinv11);
2092
2093             /* EWALD ELECTROSTATICS */
2094             
2095             /* Analytical PME correction */
2096             zeta2            = _mm256_mul_ps(beta2,rsq11);
2097             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2098             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2099             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2100             felec            = _mm256_mul_ps(qq11,felec);
2101             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2102             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2103             velec            = _mm256_sub_ps(rinv11,pmecorrV);
2104             velec            = _mm256_mul_ps(qq11,velec);
2105             
2106             d                = _mm256_sub_ps(r11,rswitch);
2107             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2108             d2               = _mm256_mul_ps(d,d);
2109             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2110
2111             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2112
2113             /* Evaluate switch function */
2114             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2115             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2116             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2117
2118             fscal            = felec;
2119
2120             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2121
2122             /* Calculate temporary vectorial force */
2123             tx               = _mm256_mul_ps(fscal,dx11);
2124             ty               = _mm256_mul_ps(fscal,dy11);
2125             tz               = _mm256_mul_ps(fscal,dz11);
2126
2127             /* Update vectorial force */
2128             fix1             = _mm256_add_ps(fix1,tx);
2129             fiy1             = _mm256_add_ps(fiy1,ty);
2130             fiz1             = _mm256_add_ps(fiz1,tz);
2131
2132             fjx1             = _mm256_add_ps(fjx1,tx);
2133             fjy1             = _mm256_add_ps(fjy1,ty);
2134             fjz1             = _mm256_add_ps(fjz1,tz);
2135
2136             }
2137
2138             /**************************
2139              * CALCULATE INTERACTIONS *
2140              **************************/
2141
2142             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2143             {
2144
2145             r12              = _mm256_mul_ps(rsq12,rinv12);
2146
2147             /* EWALD ELECTROSTATICS */
2148             
2149             /* Analytical PME correction */
2150             zeta2            = _mm256_mul_ps(beta2,rsq12);
2151             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2152             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2153             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2154             felec            = _mm256_mul_ps(qq12,felec);
2155             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2156             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2157             velec            = _mm256_sub_ps(rinv12,pmecorrV);
2158             velec            = _mm256_mul_ps(qq12,velec);
2159             
2160             d                = _mm256_sub_ps(r12,rswitch);
2161             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2162             d2               = _mm256_mul_ps(d,d);
2163             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2164
2165             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2166
2167             /* Evaluate switch function */
2168             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2169             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2170             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2171
2172             fscal            = felec;
2173
2174             fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx12);
            ty               = _mm256_mul_ps(fscal,dy12);
            tz               = _mm256_mul_ps(fscal,dz12);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq20,rcutoff2))
            {

            r20              = _mm256_mul_ps(rsq20,rinv20);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq20);
            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq20,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv20,pmecorrV);
            velec            = _mm256_mul_ps(qq20,velec);

            d                = _mm256_sub_ps(r20,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx20);
            ty               = _mm256_mul_ps(fscal,dy20);
            tz               = _mm256_mul_ps(fscal,dz20);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq21,rcutoff2))
            {

            r21              = _mm256_mul_ps(rsq21,rinv21);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq21);
            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq21,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv21,pmecorrV);
            velec            = _mm256_mul_ps(qq21,velec);

            d                = _mm256_sub_ps(r21,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx21);
            ty               = _mm256_mul_ps(fscal,dy21);
            tz               = _mm256_mul_ps(fscal,dz21);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq22,rcutoff2))
            {

            r22              = _mm256_mul_ps(rsq22,rinv22);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq22);
            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq22,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv22,pmecorrV);
            velec            = _mm256_mul_ps(qq22,velec);

            d                = _mm256_sub_ps(r22,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx22);
            ty               = _mm256_mul_ps(fscal,dy22);
            tz               = _mm256_mul_ps(fscal,dz22);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            fjptrA             = f+j_coord_offsetA;
            fjptrB             = f+j_coord_offsetB;
            fjptrC             = f+j_coord_offsetC;
            fjptrD             = f+j_coord_offsetD;
            fjptrE             = f+j_coord_offsetE;
            fjptrF             = f+j_coord_offsetF;
            fjptrG             = f+j_coord_offsetG;
            fjptrH             = f+j_coord_offsetH;

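            /* Added note (not generator output): the accumulated j forces are
             * still in SIMD (structure-of-arrays) layout; the helper below
             * transposes them back to per-atom rvec layout and subtracts them
             * from the eight j-atom force slots, the Newton's-third-law
             * counterpart of the i-force additions above.
             */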
            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);

            /* Inner loop uses 945 flops */
        }

        if(jidx<j_index_end)
        {

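            /* Added note (not generator output): this trailing block handles
             * the final, partially filled batch of j entries. Negative jnr
             * values mark padding; they are clamped to index 0 so the loads
             * stay in bounds, and dummy_mask later clears both r and the
             * resulting forces for those lanes.
             */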
            /* Get j neighbor index, and coordinate index */
            jnrlistA         = jjnr[jidx];
            jnrlistB         = jjnr[jidx+1];
            jnrlistC         = jjnr[jidx+2];
            jnrlistD         = jjnr[jidx+3];
            jnrlistE         = jjnr[jidx+4];
            jnrlistF         = jjnr[jidx+5];
            jnrlistG         = jjnr[jidx+6];
            jnrlistH         = jjnr[jidx+7];
            /* The sign of each element will be negative for non-real atoms.
             * The mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
             * so it can be used as val = _mm256_andnot_ps(mask,val) to clear
             * dummy entries.
             */
            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
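            /* Added note (not generator output): plain AVX has no 256-bit
             * integer compare, so the sign test is done with two 128-bit
             * _mm_cmplt_epi32 calls and the halves are merged into a single
             * 256-bit float mask.
             */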

            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
            j_coord_offsetA  = DIM*jnrA;
            j_coord_offsetB  = DIM*jnrB;
            j_coord_offsetC  = DIM*jnrC;
            j_coord_offsetD  = DIM*jnrD;
            j_coord_offsetE  = DIM*jnrE;
            j_coord_offsetF  = DIM*jnrF;
            j_coord_offsetG  = DIM*jnrG;
            j_coord_offsetH  = DIM*jnrH;

            /* load j atom coordinates */
            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
                                                 x+j_coord_offsetC,x+j_coord_offsetD,
                                                 x+j_coord_offsetE,x+j_coord_offsetF,
                                                 x+j_coord_offsetG,x+j_coord_offsetH,
                                              &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);

            /* Calculate displacement vector */
            dx00             = _mm256_sub_ps(ix0,jx0);
            dy00             = _mm256_sub_ps(iy0,jy0);
            dz00             = _mm256_sub_ps(iz0,jz0);
            dx01             = _mm256_sub_ps(ix0,jx1);
            dy01             = _mm256_sub_ps(iy0,jy1);
            dz01             = _mm256_sub_ps(iz0,jz1);
            dx02             = _mm256_sub_ps(ix0,jx2);
            dy02             = _mm256_sub_ps(iy0,jy2);
            dz02             = _mm256_sub_ps(iz0,jz2);
            dx10             = _mm256_sub_ps(ix1,jx0);
            dy10             = _mm256_sub_ps(iy1,jy0);
            dz10             = _mm256_sub_ps(iz1,jz0);
            dx11             = _mm256_sub_ps(ix1,jx1);
            dy11             = _mm256_sub_ps(iy1,jy1);
            dz11             = _mm256_sub_ps(iz1,jz1);
            dx12             = _mm256_sub_ps(ix1,jx2);
            dy12             = _mm256_sub_ps(iy1,jy2);
            dz12             = _mm256_sub_ps(iz1,jz2);
            dx20             = _mm256_sub_ps(ix2,jx0);
            dy20             = _mm256_sub_ps(iy2,jy0);
            dz20             = _mm256_sub_ps(iz2,jz0);
            dx21             = _mm256_sub_ps(ix2,jx1);
            dy21             = _mm256_sub_ps(iy2,jy1);
            dz21             = _mm256_sub_ps(iz2,jz1);
            dx22             = _mm256_sub_ps(ix2,jx2);
            dy22             = _mm256_sub_ps(iy2,jy2);
            dz22             = _mm256_sub_ps(iz2,jz2);

            /* Calculate squared distance and things based on it */
            rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
            rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
            rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
            rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
            rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);

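            /* Added note (not generator output): gmx_mm256_invsqrt_ps() is,
             * as implemented in the AVX kernel utilities, a hardware rsqrt
             * estimate refined by a Newton-Raphson step, which is cheaper
             * than a full-precision sqrt plus divide.
             */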
            rinv00           = gmx_mm256_invsqrt_ps(rsq00);
            rinv01           = gmx_mm256_invsqrt_ps(rsq01);
            rinv02           = gmx_mm256_invsqrt_ps(rsq02);
            rinv10           = gmx_mm256_invsqrt_ps(rsq10);
            rinv11           = gmx_mm256_invsqrt_ps(rsq11);
            rinv12           = gmx_mm256_invsqrt_ps(rsq12);
            rinv20           = gmx_mm256_invsqrt_ps(rsq20);
            rinv21           = gmx_mm256_invsqrt_ps(rsq21);
            rinv22           = gmx_mm256_invsqrt_ps(rsq22);

            rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
            rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
            rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
            rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
            rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);

            fjx0             = _mm256_setzero_ps();
            fjy0             = _mm256_setzero_ps();
            fjz0             = _mm256_setzero_ps();
            fjx1             = _mm256_setzero_ps();
            fjy1             = _mm256_setzero_ps();
            fjz1             = _mm256_setzero_ps();
            fjx2             = _mm256_setzero_ps();
            fjy2             = _mm256_setzero_ps();
            fjz2             = _mm256_setzero_ps();

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq00,rcutoff2))
            {

            r00              = _mm256_mul_ps(rsq00,rinv00);
            r00              = _mm256_andnot_ps(dummy_mask,r00);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq00);
            rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq00,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv00,pmecorrV);
            velec            = _mm256_mul_ps(qq00,velec);

            d                = _mm256_sub_ps(r00,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv00,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx00);
            ty               = _mm256_mul_ps(fscal,dy00);
            tz               = _mm256_mul_ps(fscal,dz00);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq01,rcutoff2))
            {

            r01              = _mm256_mul_ps(rsq01,rinv01);
            r01              = _mm256_andnot_ps(dummy_mask,r01);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq01);
            rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq01,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv01,pmecorrV);
            velec            = _mm256_mul_ps(qq01,velec);

            d                = _mm256_sub_ps(r01,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv01,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx01);
            ty               = _mm256_mul_ps(fscal,dy01);
            tz               = _mm256_mul_ps(fscal,dz01);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq02,rcutoff2))
            {

            r02              = _mm256_mul_ps(rsq02,rinv02);
            r02              = _mm256_andnot_ps(dummy_mask,r02);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq02);
            rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq02,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv02,pmecorrV);
            velec            = _mm256_mul_ps(qq02,velec);

            d                = _mm256_sub_ps(r02,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv02,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx02);
            ty               = _mm256_mul_ps(fscal,dy02);
            tz               = _mm256_mul_ps(fscal,dz02);

            /* Update vectorial force */
            fix0             = _mm256_add_ps(fix0,tx);
            fiy0             = _mm256_add_ps(fiy0,ty);
            fiz0             = _mm256_add_ps(fiz0,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq10,rcutoff2))
            {

            r10              = _mm256_mul_ps(rsq10,rinv10);
            r10              = _mm256_andnot_ps(dummy_mask,r10);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq10);
            rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq10,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv10,pmecorrV);
            velec            = _mm256_mul_ps(qq10,velec);

            d                = _mm256_sub_ps(r10,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv10,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx10);
            ty               = _mm256_mul_ps(fscal,dy10);
            tz               = _mm256_mul_ps(fscal,dz10);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq11,rcutoff2))
            {

            r11              = _mm256_mul_ps(rsq11,rinv11);
            r11              = _mm256_andnot_ps(dummy_mask,r11);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq11);
            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq11,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv11,pmecorrV);
            velec            = _mm256_mul_ps(qq11,velec);

            d                = _mm256_sub_ps(r11,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx11);
            ty               = _mm256_mul_ps(fscal,dy11);
            tz               = _mm256_mul_ps(fscal,dz11);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq12,rcutoff2))
            {

            r12              = _mm256_mul_ps(rsq12,rinv12);
            r12              = _mm256_andnot_ps(dummy_mask,r12);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq12);
            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq12,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv12,pmecorrV);
            velec            = _mm256_mul_ps(qq12,velec);

            d                = _mm256_sub_ps(r12,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx12);
            ty               = _mm256_mul_ps(fscal,dy12);
            tz               = _mm256_mul_ps(fscal,dz12);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq20,rcutoff2))
            {

            r20              = _mm256_mul_ps(rsq20,rinv20);
            r20              = _mm256_andnot_ps(dummy_mask,r20);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq20);
            rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq20,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv20,pmecorrV);
            velec            = _mm256_mul_ps(qq20,velec);

            d                = _mm256_sub_ps(r20,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv20,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx20);
            ty               = _mm256_mul_ps(fscal,dy20);
            tz               = _mm256_mul_ps(fscal,dz20);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx0             = _mm256_add_ps(fjx0,tx);
            fjy0             = _mm256_add_ps(fjy0,ty);
            fjz0             = _mm256_add_ps(fjz0,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq21,rcutoff2))
            {

            r21              = _mm256_mul_ps(rsq21,rinv21);
            r21              = _mm256_andnot_ps(dummy_mask,r21);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq21);
            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq21,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv21,pmecorrV);
            velec            = _mm256_mul_ps(qq21,velec);

            d                = _mm256_sub_ps(r21,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx21);
            ty               = _mm256_mul_ps(fscal,dy21);
            tz               = _mm256_mul_ps(fscal,dz21);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq22,rcutoff2))
            {

            r22              = _mm256_mul_ps(rsq22,rinv22);
            r22              = _mm256_andnot_ps(dummy_mask,r22);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq22);
            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
            pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq22,felec);
            pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv22,pmecorrV);
            velec            = _mm256_mul_ps(qq22,velec);

            d                = _mm256_sub_ps(r22,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx22);
            ty               = _mm256_mul_ps(fscal,dy22);
            tz               = _mm256_mul_ps(fscal,dz22);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

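            /* Added note (not generator output): force contributions for
             * padding lanes are redirected to a scratch buffer instead of real
             * atom storage; combined with dummy_mask, they cannot perturb any
             * real force.
             */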
            fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
            fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
            fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
            fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
            fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
            fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
            fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
            fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;

            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
                                                      fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);

            /* Inner loop uses 954 flops */
        }

        /* End of innermost loop */

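        /* Added note (not generator output): the i-atom force accumulators are
         * reduced across the eight SIMD lanes here and added to f[], and the
         * same sums are accumulated into fshift[] so the virial can later be
         * computed from the shift forces.
         */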
        gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
                                                 f+i_coord_offset,fshift+i_shift_offset);

        /* Increment number of inner iterations */
        inneriter                  += j_index_end - j_index_start;

        /* Outer loop uses 18 flops */
    }

    /* Increment number of outer iterations */
    outeriter        += nri;

    /* Update outer/inner flops */

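    /* Added note (not generator output): inc_nrnb() only updates the flop
     * bookkeeping that mdrun reports at the end of a run; it has no effect on
     * the forces. The factors correspond to the generator's per-iteration
     * flop estimates noted in the loop comments above.
     */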
    inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*954);
}