src/gromacs/gmxlib/nonbonded/nb_kernel_avx_256_single/nb_kernel_ElecEwSw_VdwNone_GeomW4W4_avx_256_single.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*
 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
 */
#include "gmxpre.h"

#include "config.h"

#include <math.h>

#include "../nb_kernel.h"
#include "gromacs/gmxlib/nrnb.h"

#include "kernelutil_x86_avx_256_single.h"

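/* The gmx_mm256_* load/store helpers and the avx256_* math helpers used below
 * are provided by kernelutil_x86_avx_256_single.h and the headers it includes
 * (an inference from the naming convention; they are not defined in this file).
 */
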
/*
 * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single
 * Electrostatics interaction: Ewald
 * VdW interaction:            None
 * Geometry:                   Water4-Water4
 * Calculate force/pot:        PotentialAndForce
 */
void
nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single
                    (t_nblist                    * gmx_restrict       nlist,
                     rvec                        * gmx_restrict          xx,
                     rvec                        * gmx_restrict          ff,
                     struct t_forcerec           * gmx_restrict          fr,
                     t_mdatoms                   * gmx_restrict     mdatoms,
                     nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
                     t_nrnb                      * gmx_restrict        nrnb)
{
    /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
     * just 0 for non-waters.
     * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. the eight different
     * jnr indices corresponding to data put in the eight positions in the SIMD register.
     */
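    /* With VdwNone, site 0 of each four-site water (the LJ-only site in
     * TIP4P-like models) never contributes, so the coordinate loads below
     * skip it by starting at offset +DIM, and charges are read from
     * charge[inr+1..inr+3] only.
     */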
    int              i_shift_offset,i_coord_offset,outeriter,inneriter;
    int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
    int              jnrA,jnrB,jnrC,jnrD;
    int              jnrE,jnrF,jnrG,jnrH;
    int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
    int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
    int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
    int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
    int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
    real             rcutoff_scalar;
    real             *shiftvec,*fshift,*x,*f;
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
    real             scratch[4*DIM];
    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr1;
    __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
    real *           vdwioffsetptr2;
    __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
    real *           vdwioffsetptr3;
    __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
    int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
    __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
    int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
    __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
    int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
    __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
    __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
    __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
    __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
    __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
    __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
    __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
    __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
    __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
    __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
    __m256           velec,felec,velecsum,facel,crf,krf,krf2;
    real             *charge;
    __m256i          ewitab;
    __m128i          ewitab_lo,ewitab_hi;
    __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
    __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
    real             *ewtab;
    __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
    real             rswitch_scalar,d_scalar;
    __m256           dummy_mask,cutoff_mask;
    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    __m256           one     = _mm256_set1_ps(1.0);
    __m256           two     = _mm256_set1_ps(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm256_set1_ps(fr->ic->epsfac);
    charge           = mdatoms->chargeA;

    sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
    beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
    beta2            = _mm256_mul_ps(beta,beta);
    beta3            = _mm256_mul_ps(beta,beta2);

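    /* This kernel evaluates the real-space part of the Ewald sum for each pair:
     * velec = qq*erfc(beta*r)/r, computed below as qq*(1/r - erf(beta*r)/r),
     * with avx256_pmecorrV_f/avx256_pmecorrF_f supplying polynomial
     * approximations of the erf-based correction terms (a reading of the
     * generated code below, not generator documentation).
     */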
    ewtab            = fr->ic->tabq_coul_FDV0;
    ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
    ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);

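    /* Note: the table pointer and scales above (and several variables declared
     * earlier, e.g. ewitab and signbit) are emitted by the kernel generator but
     * are not referenced by the analytical-correction path in the interaction
     * blocks below.
     */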
    /* Setup water-specific parameters */
    inr              = nlist->iinr[0];
    iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
    iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
    iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));

    jq1              = _mm256_set1_ps(charge[inr+1]);
    jq2              = _mm256_set1_ps(charge[inr+2]);
    jq3              = _mm256_set1_ps(charge[inr+3]);
    qq11             = _mm256_mul_ps(iq1,jq1);
    qq12             = _mm256_mul_ps(iq1,jq2);
    qq13             = _mm256_mul_ps(iq1,jq3);
    qq21             = _mm256_mul_ps(iq2,jq1);
    qq22             = _mm256_mul_ps(iq2,jq2);
    qq23             = _mm256_mul_ps(iq2,jq3);
    qq31             = _mm256_mul_ps(iq3,jq1);
    qq32             = _mm256_mul_ps(iq3,jq2);
    qq33             = _mm256_mul_ps(iq3,jq3);

    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
    rcutoff_scalar   = fr->ic->rcoulomb;
    rcutoff          = _mm256_set1_ps(rcutoff_scalar);
    rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);

    rswitch_scalar   = fr->ic->rcoulomb_switch;
    rswitch          = _mm256_set1_ps(rswitch_scalar);
    /* Setup switch parameters */
    d_scalar         = rcutoff_scalar-rswitch_scalar;
    d                = _mm256_set1_ps(d_scalar);
    swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
    swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
    swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
    swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
    swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
    swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));

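    /* With t = (r - rswitch)/(rcutoff - rswitch), the coefficients above give
     * the standard fifth-order potential switch sw(t) = 1 - 10*t^3 + 15*t^4 - 6*t^5,
     * and dsw in the loop below is d(sw)/dr, so sw(rswitch) = 1, sw(rcutoff) = 0,
     * and the first two derivatives vanish at both ends.
     */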
    /* Avoid stupid compiler warnings */
    jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
    j_coord_offsetA = 0;
    j_coord_offsetB = 0;
    j_coord_offsetC = 0;
    j_coord_offsetD = 0;
    j_coord_offsetE = 0;
    j_coord_offsetF = 0;
    j_coord_offsetG = 0;
    j_coord_offsetH = 0;

    outeriter        = 0;
    inneriter        = 0;

    for(iidx=0;iidx<4*DIM;iidx++)
    {
        scratch[iidx] = 0.0;
    }

    /* Start outer loop over neighborlists */
    for(iidx=0; iidx<nri; iidx++)
    {
        /* Load shift vector for this list */
        i_shift_offset   = DIM*shiftidx[iidx];

        /* Load limits for loop over neighbors */
        j_index_start    = jindex[iidx];
        j_index_end      = jindex[iidx+1];

        /* Get outer coordinate index */
        inr              = iinr[iidx];
        i_coord_offset   = DIM*inr;

        /* Load i particle coords and add shift vector */
        gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
                                                    &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);

        fix1             = _mm256_setzero_ps();
        fiy1             = _mm256_setzero_ps();
        fiz1             = _mm256_setzero_ps();
        fix2             = _mm256_setzero_ps();
        fiy2             = _mm256_setzero_ps();
        fiz2             = _mm256_setzero_ps();
        fix3             = _mm256_setzero_ps();
        fiy3             = _mm256_setzero_ps();
        fiz3             = _mm256_setzero_ps();

        /* Reset potential sums */
        velecsum         = _mm256_setzero_ps();

        /* Start inner kernel loop */
        for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
        {

            /* Get j neighbor index, and coordinate index */
            jnrA             = jjnr[jidx];
            jnrB             = jjnr[jidx+1];
            jnrC             = jjnr[jidx+2];
            jnrD             = jjnr[jidx+3];
            jnrE             = jjnr[jidx+4];
            jnrF             = jjnr[jidx+5];
            jnrG             = jjnr[jidx+6];
            jnrH             = jjnr[jidx+7];
            j_coord_offsetA  = DIM*jnrA;
            j_coord_offsetB  = DIM*jnrB;
            j_coord_offsetC  = DIM*jnrC;
            j_coord_offsetD  = DIM*jnrD;
            j_coord_offsetE  = DIM*jnrE;
            j_coord_offsetF  = DIM*jnrF;
            j_coord_offsetG  = DIM*jnrG;
            j_coord_offsetH  = DIM*jnrH;

            /* load j atom coordinates */
            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
                                                 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
                                                 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
                                                 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
                                                 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);

            /* Calculate displacement vector */
            dx11             = _mm256_sub_ps(ix1,jx1);
            dy11             = _mm256_sub_ps(iy1,jy1);
            dz11             = _mm256_sub_ps(iz1,jz1);
            dx12             = _mm256_sub_ps(ix1,jx2);
            dy12             = _mm256_sub_ps(iy1,jy2);
            dz12             = _mm256_sub_ps(iz1,jz2);
            dx13             = _mm256_sub_ps(ix1,jx3);
            dy13             = _mm256_sub_ps(iy1,jy3);
            dz13             = _mm256_sub_ps(iz1,jz3);
            dx21             = _mm256_sub_ps(ix2,jx1);
            dy21             = _mm256_sub_ps(iy2,jy1);
            dz21             = _mm256_sub_ps(iz2,jz1);
            dx22             = _mm256_sub_ps(ix2,jx2);
            dy22             = _mm256_sub_ps(iy2,jy2);
            dz22             = _mm256_sub_ps(iz2,jz2);
            dx23             = _mm256_sub_ps(ix2,jx3);
            dy23             = _mm256_sub_ps(iy2,jy3);
            dz23             = _mm256_sub_ps(iz2,jz3);
            dx31             = _mm256_sub_ps(ix3,jx1);
            dy31             = _mm256_sub_ps(iy3,jy1);
            dz31             = _mm256_sub_ps(iz3,jz1);
            dx32             = _mm256_sub_ps(ix3,jx2);
            dy32             = _mm256_sub_ps(iy3,jy2);
            dz32             = _mm256_sub_ps(iz3,jz2);
            dx33             = _mm256_sub_ps(ix3,jx3);
            dy33             = _mm256_sub_ps(iy3,jy3);
            dz33             = _mm256_sub_ps(iz3,jz3);

            /* Calculate squared distance and things based on it */
            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
            rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
            rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
            rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
            rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
            rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);

            rinv11           = avx256_invsqrt_f(rsq11);
            rinv12           = avx256_invsqrt_f(rsq12);
            rinv13           = avx256_invsqrt_f(rsq13);
            rinv21           = avx256_invsqrt_f(rsq21);
            rinv22           = avx256_invsqrt_f(rsq22);
            rinv23           = avx256_invsqrt_f(rsq23);
            rinv31           = avx256_invsqrt_f(rsq31);
            rinv32           = avx256_invsqrt_f(rsq32);
            rinv33           = avx256_invsqrt_f(rsq33);

            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
            rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
            rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
            rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
            rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
            rinvsq33         = _mm256_mul_ps(rinv33,rinv33);

            fjx1             = _mm256_setzero_ps();
            fjy1             = _mm256_setzero_ps();
            fjz1             = _mm256_setzero_ps();
            fjx2             = _mm256_setzero_ps();
            fjy2             = _mm256_setzero_ps();
            fjz2             = _mm256_setzero_ps();
            fjx3             = _mm256_setzero_ps();
            fjy3             = _mm256_setzero_ps();
            fjz3             = _mm256_setzero_ps();

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq11,rcutoff2))
            {

            r11              = _mm256_mul_ps(rsq11,rinv11);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq11);
            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq11,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv11,pmecorrV);
            velec            = _mm256_mul_ps(qq11,velec);

            d                = _mm256_sub_ps(r11,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx11);
            ty               = _mm256_mul_ps(fscal,dy11);
            tz               = _mm256_mul_ps(fscal,dz11);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }
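
            /* The eight blocks below repeat the same Ewald + switch evaluation
             * for the remaining charged site pairs (1-2, 1-3, 2-1, 2-2, 2-3,
             * 3-1, 3-2, 3-3); only the qq/displacement/rinv operands and the
             * force accumulators differ.
             */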

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq12,rcutoff2))
            {

            r12              = _mm256_mul_ps(rsq12,rinv12);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq12);
            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq12,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv12,pmecorrV);
            velec            = _mm256_mul_ps(qq12,velec);

            d                = _mm256_sub_ps(r12,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx12);
            ty               = _mm256_mul_ps(fscal,dy12);
            tz               = _mm256_mul_ps(fscal,dz12);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq13,rcutoff2))
            {

            r13              = _mm256_mul_ps(rsq13,rinv13);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq13);
            rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq13,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv13,pmecorrV);
            velec            = _mm256_mul_ps(qq13,velec);

            d                = _mm256_sub_ps(r13,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx13);
            ty               = _mm256_mul_ps(fscal,dy13);
            tz               = _mm256_mul_ps(fscal,dz13);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx3             = _mm256_add_ps(fjx3,tx);
            fjy3             = _mm256_add_ps(fjy3,ty);
            fjz3             = _mm256_add_ps(fjz3,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq21,rcutoff2))
            {

            r21              = _mm256_mul_ps(rsq21,rinv21);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq21);
            rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq21,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv21,pmecorrV);
            velec            = _mm256_mul_ps(qq21,velec);

            d                = _mm256_sub_ps(r21,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx21);
            ty               = _mm256_mul_ps(fscal,dy21);
            tz               = _mm256_mul_ps(fscal,dz21);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq22,rcutoff2))
            {

            r22              = _mm256_mul_ps(rsq22,rinv22);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq22);
            rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq22,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv22,pmecorrV);
            velec            = _mm256_mul_ps(qq22,velec);

            d                = _mm256_sub_ps(r22,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx22);
            ty               = _mm256_mul_ps(fscal,dy22);
            tz               = _mm256_mul_ps(fscal,dz22);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq23,rcutoff2))
            {

            r23              = _mm256_mul_ps(rsq23,rinv23);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq23);
            rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq23,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv23,pmecorrV);
            velec            = _mm256_mul_ps(qq23,velec);

            d                = _mm256_sub_ps(r23,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx23);
            ty               = _mm256_mul_ps(fscal,dy23);
            tz               = _mm256_mul_ps(fscal,dz23);

            /* Update vectorial force */
            fix2             = _mm256_add_ps(fix2,tx);
            fiy2             = _mm256_add_ps(fiy2,ty);
            fiz2             = _mm256_add_ps(fiz2,tz);

            fjx3             = _mm256_add_ps(fjx3,tx);
            fjy3             = _mm256_add_ps(fjy3,ty);
            fjz3             = _mm256_add_ps(fjz3,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq31,rcutoff2))
            {

            r31              = _mm256_mul_ps(rsq31,rinv31);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq31);
            rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq31,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv31,pmecorrV);
            velec            = _mm256_mul_ps(qq31,velec);

            d                = _mm256_sub_ps(r31,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx31);
            ty               = _mm256_mul_ps(fscal,dy31);
            tz               = _mm256_mul_ps(fscal,dz31);

            /* Update vectorial force */
            fix3             = _mm256_add_ps(fix3,tx);
            fiy3             = _mm256_add_ps(fiy3,ty);
            fiz3             = _mm256_add_ps(fiz3,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq32,rcutoff2))
            {

            r32              = _mm256_mul_ps(rsq32,rinv32);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq32);
            rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq32,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv32,pmecorrV);
            velec            = _mm256_mul_ps(qq32,velec);

            d                = _mm256_sub_ps(r32,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx32);
            ty               = _mm256_mul_ps(fscal,dy32);
            tz               = _mm256_mul_ps(fscal,dz32);

            /* Update vectorial force */
            fix3             = _mm256_add_ps(fix3,tx);
            fiy3             = _mm256_add_ps(fiy3,ty);
            fiz3             = _mm256_add_ps(fiz3,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq33,rcutoff2))
            {

            r33              = _mm256_mul_ps(rsq33,rinv33);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq33);
            rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq33,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv33,pmecorrV);
            velec            = _mm256_mul_ps(qq33,velec);

            d                = _mm256_sub_ps(r33,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx33);
            ty               = _mm256_mul_ps(fscal,dy33);
            tz               = _mm256_mul_ps(fscal,dz33);

            /* Update vectorial force */
            fix3             = _mm256_add_ps(fix3,tx);
            fiy3             = _mm256_add_ps(fiy3,ty);
            fiz3             = _mm256_add_ps(fiz3,tz);

            fjx3             = _mm256_add_ps(fjx3,tx);
            fjy3             = _mm256_add_ps(fjy3,ty);
            fjz3             = _mm256_add_ps(fjz3,tz);

            }

            fjptrA             = f+j_coord_offsetA;
            fjptrB             = f+j_coord_offsetB;
            fjptrC             = f+j_coord_offsetC;
            fjptrD             = f+j_coord_offsetD;
            fjptrE             = f+j_coord_offsetE;
            fjptrF             = f+j_coord_offsetF;
            fjptrG             = f+j_coord_offsetG;
            fjptrH             = f+j_coord_offsetH;

            gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
                                                      fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
                                                      fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);

            /* Inner loop uses 972 flops */
        }

        if(jidx<j_index_end)
        {

            /* Get j neighbor index, and coordinate index */
            jnrlistA         = jjnr[jidx];
            jnrlistB         = jjnr[jidx+1];
            jnrlistC         = jjnr[jidx+2];
            jnrlistD         = jjnr[jidx+3];
            jnrlistE         = jjnr[jidx+4];
            jnrlistF         = jjnr[jidx+5];
            jnrlistG         = jjnr[jidx+6];
            jnrlistH         = jjnr[jidx+7];
            /* Sign of each element will be negative for non-real atoms.
             * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
             * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
             */
            dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
                                            gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));

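            /* Padded lanes are redirected to atom 0 below so the coordinate
             * loads stay in bounds; dummy_mask then zeroes r, velec and fscal
             * in those lanes, so the padding contributes nothing to the
             * potential or force accumulators.
             */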
            jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
            jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
            jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
            jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
            jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
            jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
            jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
            jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
            j_coord_offsetA  = DIM*jnrA;
            j_coord_offsetB  = DIM*jnrB;
            j_coord_offsetC  = DIM*jnrC;
            j_coord_offsetD  = DIM*jnrD;
            j_coord_offsetE  = DIM*jnrE;
            j_coord_offsetF  = DIM*jnrF;
            j_coord_offsetG  = DIM*jnrG;
            j_coord_offsetH  = DIM*jnrH;

            /* load j atom coordinates */
            gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
                                                 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
                                                 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
                                                 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
                                                 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);

            /* Calculate displacement vector */
            dx11             = _mm256_sub_ps(ix1,jx1);
            dy11             = _mm256_sub_ps(iy1,jy1);
            dz11             = _mm256_sub_ps(iz1,jz1);
            dx12             = _mm256_sub_ps(ix1,jx2);
            dy12             = _mm256_sub_ps(iy1,jy2);
            dz12             = _mm256_sub_ps(iz1,jz2);
            dx13             = _mm256_sub_ps(ix1,jx3);
            dy13             = _mm256_sub_ps(iy1,jy3);
            dz13             = _mm256_sub_ps(iz1,jz3);
            dx21             = _mm256_sub_ps(ix2,jx1);
            dy21             = _mm256_sub_ps(iy2,jy1);
            dz21             = _mm256_sub_ps(iz2,jz1);
            dx22             = _mm256_sub_ps(ix2,jx2);
            dy22             = _mm256_sub_ps(iy2,jy2);
            dz22             = _mm256_sub_ps(iz2,jz2);
            dx23             = _mm256_sub_ps(ix2,jx3);
            dy23             = _mm256_sub_ps(iy2,jy3);
            dz23             = _mm256_sub_ps(iz2,jz3);
            dx31             = _mm256_sub_ps(ix3,jx1);
            dy31             = _mm256_sub_ps(iy3,jy1);
            dz31             = _mm256_sub_ps(iz3,jz1);
            dx32             = _mm256_sub_ps(ix3,jx2);
            dy32             = _mm256_sub_ps(iy3,jy2);
            dz32             = _mm256_sub_ps(iz3,jz2);
            dx33             = _mm256_sub_ps(ix3,jx3);
            dy33             = _mm256_sub_ps(iy3,jy3);
            dz33             = _mm256_sub_ps(iz3,jz3);

            /* Calculate squared distance and things based on it */
            rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
            rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
            rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
            rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
            rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
            rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
            rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
            rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
            rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);

            rinv11           = avx256_invsqrt_f(rsq11);
            rinv12           = avx256_invsqrt_f(rsq12);
            rinv13           = avx256_invsqrt_f(rsq13);
            rinv21           = avx256_invsqrt_f(rsq21);
            rinv22           = avx256_invsqrt_f(rsq22);
            rinv23           = avx256_invsqrt_f(rsq23);
            rinv31           = avx256_invsqrt_f(rsq31);
            rinv32           = avx256_invsqrt_f(rsq32);
            rinv33           = avx256_invsqrt_f(rsq33);

            rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
            rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
            rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
            rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
            rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
            rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
            rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
            rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
            rinvsq33         = _mm256_mul_ps(rinv33,rinv33);

            fjx1             = _mm256_setzero_ps();
            fjy1             = _mm256_setzero_ps();
            fjz1             = _mm256_setzero_ps();
            fjx2             = _mm256_setzero_ps();
            fjy2             = _mm256_setzero_ps();
            fjz2             = _mm256_setzero_ps();
            fjx3             = _mm256_setzero_ps();
            fjy3             = _mm256_setzero_ps();
            fjz3             = _mm256_setzero_ps();

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq11,rcutoff2))
            {

            r11              = _mm256_mul_ps(rsq11,rinv11);
            r11              = _mm256_andnot_ps(dummy_mask,r11);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq11);
            rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq11,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv11,pmecorrV);
            velec            = _mm256_mul_ps(qq11,velec);

            d                = _mm256_sub_ps(r11,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx11);
            ty               = _mm256_mul_ps(fscal,dy11);
            tz               = _mm256_mul_ps(fscal,dz11);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx1             = _mm256_add_ps(fjx1,tx);
            fjy1             = _mm256_add_ps(fjy1,ty);
            fjz1             = _mm256_add_ps(fjz1,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq12,rcutoff2))
            {

            r12              = _mm256_mul_ps(rsq12,rinv12);
            r12              = _mm256_andnot_ps(dummy_mask,r12);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq12);
            rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq12,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv12,pmecorrV);
            velec            = _mm256_mul_ps(qq12,velec);

            d                = _mm256_sub_ps(r12,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx12);
            ty               = _mm256_mul_ps(fscal,dy12);
            tz               = _mm256_mul_ps(fscal,dz12);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx2             = _mm256_add_ps(fjx2,tx);
            fjy2             = _mm256_add_ps(fjy2,ty);
            fjz2             = _mm256_add_ps(fjz2,tz);

            }

            /**************************
             * CALCULATE INTERACTIONS *
             **************************/

            if (gmx_mm256_any_lt(rsq13,rcutoff2))
            {

            r13              = _mm256_mul_ps(rsq13,rinv13);
            r13              = _mm256_andnot_ps(dummy_mask,r13);

            /* EWALD ELECTROSTATICS */

            /* Analytical PME correction */
            zeta2            = _mm256_mul_ps(beta2,rsq13);
            rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
            pmecorrF         = avx256_pmecorrF_f(zeta2);
            felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
            felec            = _mm256_mul_ps(qq13,felec);
            pmecorrV         = avx256_pmecorrV_f(zeta2);
            pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
            velec            = _mm256_sub_ps(rinv13,pmecorrV);
            velec            = _mm256_mul_ps(qq13,velec);

            d                = _mm256_sub_ps(r13,rswitch);
            d                = _mm256_max_ps(d,_mm256_setzero_ps());
            d2               = _mm256_mul_ps(d,d);
            sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));

            dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));

            /* Evaluate switch function */
            /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
            felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
            velec            = _mm256_mul_ps(velec,sw);
            cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);

            /* Update potential sum for this i atom from the interaction with this j atom. */
            velec            = _mm256_and_ps(velec,cutoff_mask);
            velec            = _mm256_andnot_ps(dummy_mask,velec);
            velecsum         = _mm256_add_ps(velecsum,velec);

            fscal            = felec;

            fscal            = _mm256_and_ps(fscal,cutoff_mask);

            fscal            = _mm256_andnot_ps(dummy_mask,fscal);

            /* Calculate temporary vectorial force */
            tx               = _mm256_mul_ps(fscal,dx13);
            ty               = _mm256_mul_ps(fscal,dy13);
            tz               = _mm256_mul_ps(fscal,dz13);

            /* Update vectorial force */
            fix1             = _mm256_add_ps(fix1,tx);
            fiy1             = _mm256_add_ps(fiy1,ty);
            fiz1             = _mm256_add_ps(fiz1,tz);

            fjx3             = _mm256_add_ps(fjx3,tx);
1171             fjy3             = _mm256_add_ps(fjy3,ty);
1172             fjz3             = _mm256_add_ps(fjz3,tz);
1173
1174             }
1175
1176             /**************************
1177              * CALCULATE INTERACTIONS *
1178              **************************/
1179
1180             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1181             {
1182
1183             r21              = _mm256_mul_ps(rsq21,rinv21);
1184             r21              = _mm256_andnot_ps(dummy_mask,r21);
1185
1186             /* EWALD ELECTROSTATICS */
1187             
1188             /* Analytical PME correction */
1189             zeta2            = _mm256_mul_ps(beta2,rsq21);
1190             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1191             pmecorrF         = avx256_pmecorrF_f(zeta2);
1192             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1193             felec            = _mm256_mul_ps(qq21,felec);
1194             pmecorrV         = avx256_pmecorrV_f(zeta2);
1195             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1196             velec            = _mm256_sub_ps(rinv21,pmecorrV);
1197             velec            = _mm256_mul_ps(qq21,velec);
1198             
1199             d                = _mm256_sub_ps(r21,rswitch);
1200             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1201             d2               = _mm256_mul_ps(d,d);
1202             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1203
1204             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1205
1206             /* Evaluate switch function */
1207             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1208             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1209             velec            = _mm256_mul_ps(velec,sw);
1210             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1211
1212             /* Update potential sum for this i atom from the interaction with this j atom. */
1213             velec            = _mm256_and_ps(velec,cutoff_mask);
1214             velec            = _mm256_andnot_ps(dummy_mask,velec);
1215             velecsum         = _mm256_add_ps(velecsum,velec);
1216
1217             fscal            = felec;
1218
1219             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1220
1221             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1222
1223             /* Calculate temporary vectorial force */
1224             tx               = _mm256_mul_ps(fscal,dx21);
1225             ty               = _mm256_mul_ps(fscal,dy21);
1226             tz               = _mm256_mul_ps(fscal,dz21);
1227
1228             /* Update vectorial force */
1229             fix2             = _mm256_add_ps(fix2,tx);
1230             fiy2             = _mm256_add_ps(fiy2,ty);
1231             fiz2             = _mm256_add_ps(fiz2,tz);
1232
1233             fjx1             = _mm256_add_ps(fjx1,tx);
1234             fjy1             = _mm256_add_ps(fjy1,ty);
1235             fjz1             = _mm256_add_ps(fjz1,tz);
1236
1237             }
1238
1239             /**************************
1240              * CALCULATE INTERACTIONS *
1241              **************************/
1242
1243             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1244             {
1245
1246             r22              = _mm256_mul_ps(rsq22,rinv22);
1247             r22              = _mm256_andnot_ps(dummy_mask,r22);
1248
1249             /* EWALD ELECTROSTATICS */
1250             
1251             /* Analytical PME correction */
1252             zeta2            = _mm256_mul_ps(beta2,rsq22);
1253             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1254             pmecorrF         = avx256_pmecorrF_f(zeta2);
1255             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1256             felec            = _mm256_mul_ps(qq22,felec);
1257             pmecorrV         = avx256_pmecorrV_f(zeta2);
1258             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1259             velec            = _mm256_sub_ps(rinv22,pmecorrV);
1260             velec            = _mm256_mul_ps(qq22,velec);
1261             
1262             d                = _mm256_sub_ps(r22,rswitch);
1263             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1264             d2               = _mm256_mul_ps(d,d);
1265             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1266
1267             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1268
1269             /* Evaluate switch function */
1270             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1271             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1272             velec            = _mm256_mul_ps(velec,sw);
1273             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1274
1275             /* Update potential sum for this i atom from the interaction with this j atom. */
1276             velec            = _mm256_and_ps(velec,cutoff_mask);
1277             velec            = _mm256_andnot_ps(dummy_mask,velec);
1278             velecsum         = _mm256_add_ps(velecsum,velec);
1279
1280             fscal            = felec;
1281
1282             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1283
1284             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1285
1286             /* Calculate temporary vectorial force */
1287             tx               = _mm256_mul_ps(fscal,dx22);
1288             ty               = _mm256_mul_ps(fscal,dy22);
1289             tz               = _mm256_mul_ps(fscal,dz22);
1290
1291             /* Update vectorial force */
1292             fix2             = _mm256_add_ps(fix2,tx);
1293             fiy2             = _mm256_add_ps(fiy2,ty);
1294             fiz2             = _mm256_add_ps(fiz2,tz);
1295
1296             fjx2             = _mm256_add_ps(fjx2,tx);
1297             fjy2             = _mm256_add_ps(fjy2,ty);
1298             fjz2             = _mm256_add_ps(fjz2,tz);
1299
1300             }
1301
1302             /**************************
1303              * CALCULATE INTERACTIONS *
1304              **************************/
1305
1306             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1307             {
1308
1309             r23              = _mm256_mul_ps(rsq23,rinv23);
1310             r23              = _mm256_andnot_ps(dummy_mask,r23);
1311
1312             /* EWALD ELECTROSTATICS */
1313             
1314             /* Analytical PME correction */
1315             zeta2            = _mm256_mul_ps(beta2,rsq23);
1316             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
1317             pmecorrF         = avx256_pmecorrF_f(zeta2);
1318             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1319             felec            = _mm256_mul_ps(qq23,felec);
1320             pmecorrV         = avx256_pmecorrV_f(zeta2);
1321             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1322             velec            = _mm256_sub_ps(rinv23,pmecorrV);
1323             velec            = _mm256_mul_ps(qq23,velec);
1324             
1325             d                = _mm256_sub_ps(r23,rswitch);
1326             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1327             d2               = _mm256_mul_ps(d,d);
1328             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1329
1330             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1331
1332             /* Evaluate switch function */
1333             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1334             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
1335             velec            = _mm256_mul_ps(velec,sw);
1336             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1337
1338             /* Update potential sum for this i atom from the interaction with this j atom. */
1339             velec            = _mm256_and_ps(velec,cutoff_mask);
1340             velec            = _mm256_andnot_ps(dummy_mask,velec);
1341             velecsum         = _mm256_add_ps(velecsum,velec);
1342
1343             fscal            = felec;
1344
1345             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1346
1347             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1348
1349             /* Calculate temporary vectorial force */
1350             tx               = _mm256_mul_ps(fscal,dx23);
1351             ty               = _mm256_mul_ps(fscal,dy23);
1352             tz               = _mm256_mul_ps(fscal,dz23);
1353
1354             /* Update vectorial force */
1355             fix2             = _mm256_add_ps(fix2,tx);
1356             fiy2             = _mm256_add_ps(fiy2,ty);
1357             fiz2             = _mm256_add_ps(fiz2,tz);
1358
1359             fjx3             = _mm256_add_ps(fjx3,tx);
1360             fjy3             = _mm256_add_ps(fjy3,ty);
1361             fjz3             = _mm256_add_ps(fjz3,tz);
1362
1363             }
1364
1365             /**************************
1366              * CALCULATE INTERACTIONS *
1367              **************************/
1368
1369             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1370             {
1371
1372             r31              = _mm256_mul_ps(rsq31,rinv31);
1373             r31              = _mm256_andnot_ps(dummy_mask,r31);
1374
1375             /* EWALD ELECTROSTATICS */
1376             
1377             /* Analytical PME correction */
1378             zeta2            = _mm256_mul_ps(beta2,rsq31);
1379             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
1380             pmecorrF         = avx256_pmecorrF_f(zeta2);
1381             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1382             felec            = _mm256_mul_ps(qq31,felec);
1383             pmecorrV         = avx256_pmecorrV_f(zeta2);
1384             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1385             velec            = _mm256_sub_ps(rinv31,pmecorrV);
1386             velec            = _mm256_mul_ps(qq31,velec);
1387             
1388             d                = _mm256_sub_ps(r31,rswitch);
1389             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1390             d2               = _mm256_mul_ps(d,d);
1391             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1392
1393             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1394
1395             /* Evaluate switch function */
1396             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1397             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
1398             velec            = _mm256_mul_ps(velec,sw);
1399             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1400
1401             /* Update potential sum for this i atom from the interaction with this j atom. */
1402             velec            = _mm256_and_ps(velec,cutoff_mask);
1403             velec            = _mm256_andnot_ps(dummy_mask,velec);
1404             velecsum         = _mm256_add_ps(velecsum,velec);
1405
1406             fscal            = felec;
1407
1408             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1409
1410             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1411
1412             /* Calculate temporary vectorial force */
1413             tx               = _mm256_mul_ps(fscal,dx31);
1414             ty               = _mm256_mul_ps(fscal,dy31);
1415             tz               = _mm256_mul_ps(fscal,dz31);
1416
1417             /* Update vectorial force */
1418             fix3             = _mm256_add_ps(fix3,tx);
1419             fiy3             = _mm256_add_ps(fiy3,ty);
1420             fiz3             = _mm256_add_ps(fiz3,tz);
1421
1422             fjx1             = _mm256_add_ps(fjx1,tx);
1423             fjy1             = _mm256_add_ps(fjy1,ty);
1424             fjz1             = _mm256_add_ps(fjz1,tz);
1425
1426             }
1427
1428             /**************************
1429              * CALCULATE INTERACTIONS *
1430              **************************/
1431
1432             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1433             {
1434
1435             r32              = _mm256_mul_ps(rsq32,rinv32);
1436             r32              = _mm256_andnot_ps(dummy_mask,r32);
1437
1438             /* EWALD ELECTROSTATICS */
1439             
1440             /* Analytical PME correction */
1441             zeta2            = _mm256_mul_ps(beta2,rsq32);
1442             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
1443             pmecorrF         = avx256_pmecorrF_f(zeta2);
1444             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1445             felec            = _mm256_mul_ps(qq32,felec);
1446             pmecorrV         = avx256_pmecorrV_f(zeta2);
1447             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1448             velec            = _mm256_sub_ps(rinv32,pmecorrV);
1449             velec            = _mm256_mul_ps(qq32,velec);
1450             
1451             d                = _mm256_sub_ps(r32,rswitch);
1452             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1453             d2               = _mm256_mul_ps(d,d);
1454             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1455
1456             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1457
1458             /* Evaluate switch function */
1459             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1460             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
1461             velec            = _mm256_mul_ps(velec,sw);
1462             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1463
1464             /* Update potential sum for this i atom from the interaction with this j atom. */
1465             velec            = _mm256_and_ps(velec,cutoff_mask);
1466             velec            = _mm256_andnot_ps(dummy_mask,velec);
1467             velecsum         = _mm256_add_ps(velecsum,velec);
1468
1469             fscal            = felec;
1470
1471             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1472
1473             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1474
1475             /* Calculate temporary vectorial force */
1476             tx               = _mm256_mul_ps(fscal,dx32);
1477             ty               = _mm256_mul_ps(fscal,dy32);
1478             tz               = _mm256_mul_ps(fscal,dz32);
1479
1480             /* Update vectorial force */
1481             fix3             = _mm256_add_ps(fix3,tx);
1482             fiy3             = _mm256_add_ps(fiy3,ty);
1483             fiz3             = _mm256_add_ps(fiz3,tz);
1484
1485             fjx2             = _mm256_add_ps(fjx2,tx);
1486             fjy2             = _mm256_add_ps(fjy2,ty);
1487             fjz2             = _mm256_add_ps(fjz2,tz);
1488
1489             }
1490
1491             /**************************
1492              * CALCULATE INTERACTIONS *
1493              **************************/
1494
1495             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1496             {
1497
1498             r33              = _mm256_mul_ps(rsq33,rinv33);
1499             r33              = _mm256_andnot_ps(dummy_mask,r33);
1500
1501             /* EWALD ELECTROSTATICS */
1502             
1503             /* Analytical PME correction */
1504             zeta2            = _mm256_mul_ps(beta2,rsq33);
1505             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
1506             pmecorrF         = avx256_pmecorrF_f(zeta2);
1507             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1508             felec            = _mm256_mul_ps(qq33,felec);
1509             pmecorrV         = avx256_pmecorrV_f(zeta2);
1510             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1511             velec            = _mm256_sub_ps(rinv33,pmecorrV);
1512             velec            = _mm256_mul_ps(qq33,velec);
1513             
1514             d                = _mm256_sub_ps(r33,rswitch);
1515             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1516             d2               = _mm256_mul_ps(d,d);
1517             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1518
1519             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1520
1521             /* Evaluate switch function */
1522             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1523             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
1524             velec            = _mm256_mul_ps(velec,sw);
1525             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1526
1527             /* Update potential sum for this i atom from the interaction with this j atom. */
1528             velec            = _mm256_and_ps(velec,cutoff_mask);
1529             velec            = _mm256_andnot_ps(dummy_mask,velec);
1530             velecsum         = _mm256_add_ps(velecsum,velec);
1531
1532             fscal            = felec;
1533
1534             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1535
1536             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1537
1538             /* Calculate temporary vectorial force */
1539             tx               = _mm256_mul_ps(fscal,dx33);
1540             ty               = _mm256_mul_ps(fscal,dy33);
1541             tz               = _mm256_mul_ps(fscal,dz33);
1542
1543             /* Update vectorial force */
1544             fix3             = _mm256_add_ps(fix3,tx);
1545             fiy3             = _mm256_add_ps(fiy3,ty);
1546             fiz3             = _mm256_add_ps(fiz3,tz);
1547
1548             fjx3             = _mm256_add_ps(fjx3,tx);
1549             fjy3             = _mm256_add_ps(fjy3,ty);
1550             fjz3             = _mm256_add_ps(fjz3,tz);
1551
1552             }
1553
1554             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1555             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1556             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1557             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1558             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1559             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1560             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1561             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
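            /* j entries with jnrlist < 0 are padding in a partially filled
             * SIMD chunk; pointing their force pointers at the local scratch
             * buffer lets the swizzled store below always run at full width,
             * while anything written for the padded lanes is simply
             * discarded. */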
1562
1563             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1564                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1565                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1566
1567             /* Inner loop uses 981 flops */
1568         }
1569
1570         /* End of innermost loop */
1571
1572         gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1573                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
1574
1575         ggid                        = gid[iidx];
1576         /* Update potential energies */
1577         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1578
1579         /* Increment number of inner iterations */
1580         inneriter                  += j_index_end - j_index_start;
1581
1582         /* Outer loop uses 19 flops */
1583     }
1584
1585     /* Increment number of outer iterations */
1586     outeriter        += nri;
1587
1588     /* Update outer/inner flops */
1589
1590     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*981);
1591 }
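/*
 * Reference sketch (not generator output, deliberately compiled out): a
 * scalar version of the switched short-range Ewald pair interaction that the
 * SIMD blocks above evaluate via the pmecorrF/pmecorrV polynomial
 * approximations. It uses erfcf()/expf() from <math.h> instead of those
 * polynomials; the coefficient arguments correspond to the swV3..swV5 and
 * swF2..swF4 values set up in the kernels.
 */
#if 0
static void
scalar_ewald_switch_reference(float qq, float r, float beta, float rswitch,
                              float swV3, float swV4, float swV5,
                              float swF2, float swF3, float swF4,
                              float *velec, float *fscal)
{
    const float two_over_sqrtpi = 1.1283791671f;   /* 2/sqrt(pi) */
    float       rinv = 1.0f/r;
    /* Short-range Ewald: V(r) = qq*erfc(beta*r)/r, and the scalar force
     * divided by r - the quantity the kernels call felec/fscal. */
    float       v    = qq*erfcf(beta*r)*rinv;
    float       f    = qq*(erfcf(beta*r)*rinv
                           + two_over_sqrtpi*beta*expf(-beta*beta*r*r))*rinv*rinv;
    /* Quintic potential switch, active only for r > rswitch */
    float       d    = (r > rswitch) ? r - rswitch : 0.0f;
    float       d2   = d*d;
    float       sw   = 1.0f + d2*d*(swV3 + d*(swV4 + d*swV5));
    float       dsw  = d2*(swF2 + d*(swF3 + d*swF4));

    *velec = v*sw;
    /* fscal' = fscal*sw - v*dsw/r, exactly as in the kernel comments */
    *fscal = f*sw - v*dsw*rinv;
}
#endif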
1592 /*
1593  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single
1594  * Electrostatics interaction: Ewald
1595  * VdW interaction:            None
1596  * Geometry:                   Water4-Water4
1597  * Calculate force/pot:        Force
1598  */
1599 void
1600 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single
1601                     (t_nblist                    * gmx_restrict       nlist,
1602                      rvec                        * gmx_restrict          xx,
1603                      rvec                        * gmx_restrict          ff,
1604                      struct t_forcerec           * gmx_restrict          fr,
1605                      t_mdatoms                   * gmx_restrict     mdatoms,
1606                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1607                      t_nrnb                      * gmx_restrict        nrnb)
1608 {
1609     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1610      * just 0 for non-waters.
1611      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1612      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1613      */
1614     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1615     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1616     int              jnrA,jnrB,jnrC,jnrD;
1617     int              jnrE,jnrF,jnrG,jnrH;
1618     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1619     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1620     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1621     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1622     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1623     real             rcutoff_scalar;
1624     real             *shiftvec,*fshift,*x,*f;
1625     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1626     real             scratch[4*DIM];
1627     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1628     real *           vdwioffsetptr1;
1629     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1630     real *           vdwioffsetptr2;
1631     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1632     real *           vdwioffsetptr3;
1633     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1634     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1635     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1636     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1637     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1638     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1639     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1640     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1641     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1642     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1643     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1644     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1645     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1646     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1647     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1648     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1649     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1650     real             *charge;
1651     __m256i          ewitab;
1652     __m128i          ewitab_lo,ewitab_hi;
1653     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1654     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1655     real             *ewtab;
1656     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1657     real             rswitch_scalar,d_scalar;
1658     __m256           dummy_mask,cutoff_mask;
1659     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1660     __m256           one     = _mm256_set1_ps(1.0);
1661     __m256           two     = _mm256_set1_ps(2.0);
1662     x                = xx[0];
1663     f                = ff[0];
1664
1665     nri              = nlist->nri;
1666     iinr             = nlist->iinr;
1667     jindex           = nlist->jindex;
1668     jjnr             = nlist->jjnr;
1669     shiftidx         = nlist->shift;
1670     gid              = nlist->gid;
1671     shiftvec         = fr->shift_vec[0];
1672     fshift           = fr->fshift[0];
1673     facel            = _mm256_set1_ps(fr->ic->epsfac);
1674     charge           = mdatoms->chargeA;
1675
1676     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1677     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1678     beta2            = _mm256_mul_ps(beta,beta);
1679     beta3            = _mm256_mul_ps(beta,beta2);
1680
1681     ewtab            = fr->ic->tabq_coul_FDV0;
1682     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1683     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
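    /* The Ewald table setup above is part of the generated boilerplate; this
     * kernel takes the analytical PME-correction path in its inner loops, so
     * the tabulated form is not referenced there. */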
1684
1685     /* Setup water-specific parameters */
1686     inr              = nlist->iinr[0];
1687     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1688     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1689     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1690
1691     jq1              = _mm256_set1_ps(charge[inr+1]);
1692     jq2              = _mm256_set1_ps(charge[inr+2]);
1693     jq3              = _mm256_set1_ps(charge[inr+3]);
1694     qq11             = _mm256_mul_ps(iq1,jq1);
1695     qq12             = _mm256_mul_ps(iq1,jq2);
1696     qq13             = _mm256_mul_ps(iq1,jq3);
1697     qq21             = _mm256_mul_ps(iq2,jq1);
1698     qq22             = _mm256_mul_ps(iq2,jq2);
1699     qq23             = _mm256_mul_ps(iq2,jq3);
1700     qq31             = _mm256_mul_ps(iq3,jq1);
1701     qq32             = _mm256_mul_ps(iq3,jq2);
1702     qq33             = _mm256_mul_ps(iq3,jq3);
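    /* All i and j molecules in a W4-W4 kernel are identical four-site waters,
     * so the nine charge products for sites 1-3 can be hoisted out of both
     * loops using the charges of the first listed molecule. */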
1703
1704     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1705     rcutoff_scalar   = fr->ic->rcoulomb;
1706     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1707     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1708
1709     rswitch_scalar   = fr->ic->rcoulomb_switch;
1710     rswitch          = _mm256_set1_ps(rswitch_scalar);
1711     /* Setup switch parameters */
1712     d_scalar         = rcutoff_scalar-rswitch_scalar;
1713     d                = _mm256_set1_ps(d_scalar);
1714     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1715     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1716     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1717     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1718     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1719     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
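    /* These are the coefficients of the quintic potential-switch polynomial
     *   sw(d) = 1 - 10*(d/D)^3 + 15*(d/D)^4 - 6*(d/D)^5
     * with d = r - rswitch and D = rcutoff - rswitch, which falls smoothly
     * from 1 to 0 with vanishing first and second derivatives at both ends;
     * swF2..swF4 are the corresponding coefficients of dsw = d(sw)/dr. */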
1720
1721     /* Avoid stupid compiler warnings */
1722     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1723     j_coord_offsetA = 0;
1724     j_coord_offsetB = 0;
1725     j_coord_offsetC = 0;
1726     j_coord_offsetD = 0;
1727     j_coord_offsetE = 0;
1728     j_coord_offsetF = 0;
1729     j_coord_offsetG = 0;
1730     j_coord_offsetH = 0;
1731
1732     outeriter        = 0;
1733     inneriter        = 0;
1734
1735     for(iidx=0;iidx<4*DIM;iidx++)
1736     {
1737         scratch[iidx] = 0.0;
1738     }
1739
1740     /* Start outer loop over neighborlists */
1741     for(iidx=0; iidx<nri; iidx++)
1742     {
1743         /* Load shift vector for this list */
1744         i_shift_offset   = DIM*shiftidx[iidx];
1745
1746         /* Load limits for loop over neighbors */
1747         j_index_start    = jindex[iidx];
1748         j_index_end      = jindex[iidx+1];
1749
1750         /* Get outer coordinate index */
1751         inr              = iinr[iidx];
1752         i_coord_offset   = DIM*inr;
1753
1754         /* Load i particle coords and add shift vector */
1755         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1756                                                     &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1757
1758         fix1             = _mm256_setzero_ps();
1759         fiy1             = _mm256_setzero_ps();
1760         fiz1             = _mm256_setzero_ps();
1761         fix2             = _mm256_setzero_ps();
1762         fiy2             = _mm256_setzero_ps();
1763         fiz2             = _mm256_setzero_ps();
1764         fix3             = _mm256_setzero_ps();
1765         fiy3             = _mm256_setzero_ps();
1766         fiz3             = _mm256_setzero_ps();
1767
1768         /* Start inner kernel loop */
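        /* This loop body assumes all eight j entries are valid
         * (jjnr[jidx+7]>=0); a separate epilogue loop with dummy_mask
         * handling, like the one in the kernel above, processes any partially
         * filled final chunk. */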
1769         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1770         {
1771
1772             /* Get j neighbor index, and coordinate index */
1773             jnrA             = jjnr[jidx];
1774             jnrB             = jjnr[jidx+1];
1775             jnrC             = jjnr[jidx+2];
1776             jnrD             = jjnr[jidx+3];
1777             jnrE             = jjnr[jidx+4];
1778             jnrF             = jjnr[jidx+5];
1779             jnrG             = jjnr[jidx+6];
1780             jnrH             = jjnr[jidx+7];
1781             j_coord_offsetA  = DIM*jnrA;
1782             j_coord_offsetB  = DIM*jnrB;
1783             j_coord_offsetC  = DIM*jnrC;
1784             j_coord_offsetD  = DIM*jnrD;
1785             j_coord_offsetE  = DIM*jnrE;
1786             j_coord_offsetF  = DIM*jnrF;
1787             j_coord_offsetG  = DIM*jnrG;
1788             j_coord_offsetH  = DIM*jnrH;
1789
1790             /* load j atom coordinates */
1791             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1792                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1793                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1794                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1795                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
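            /* The eight-pointer load gathers x/y/z for sites 1-3 of eight j
             * waters (the +DIM offset skips site 0, which has no interactions
             * in this ElecEwSw/VdwNone kernel) and transposes them to SoA
             * layout, one __m256 per coordinate component. */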
1796
1797             /* Calculate displacement vector */
1798             dx11             = _mm256_sub_ps(ix1,jx1);
1799             dy11             = _mm256_sub_ps(iy1,jy1);
1800             dz11             = _mm256_sub_ps(iz1,jz1);
1801             dx12             = _mm256_sub_ps(ix1,jx2);
1802             dy12             = _mm256_sub_ps(iy1,jy2);
1803             dz12             = _mm256_sub_ps(iz1,jz2);
1804             dx13             = _mm256_sub_ps(ix1,jx3);
1805             dy13             = _mm256_sub_ps(iy1,jy3);
1806             dz13             = _mm256_sub_ps(iz1,jz3);
1807             dx21             = _mm256_sub_ps(ix2,jx1);
1808             dy21             = _mm256_sub_ps(iy2,jy1);
1809             dz21             = _mm256_sub_ps(iz2,jz1);
1810             dx22             = _mm256_sub_ps(ix2,jx2);
1811             dy22             = _mm256_sub_ps(iy2,jy2);
1812             dz22             = _mm256_sub_ps(iz2,jz2);
1813             dx23             = _mm256_sub_ps(ix2,jx3);
1814             dy23             = _mm256_sub_ps(iy2,jy3);
1815             dz23             = _mm256_sub_ps(iz2,jz3);
1816             dx31             = _mm256_sub_ps(ix3,jx1);
1817             dy31             = _mm256_sub_ps(iy3,jy1);
1818             dz31             = _mm256_sub_ps(iz3,jz1);
1819             dx32             = _mm256_sub_ps(ix3,jx2);
1820             dy32             = _mm256_sub_ps(iy3,jy2);
1821             dz32             = _mm256_sub_ps(iz3,jz2);
1822             dx33             = _mm256_sub_ps(ix3,jx3);
1823             dy33             = _mm256_sub_ps(iy3,jy3);
1824             dz33             = _mm256_sub_ps(iz3,jz3);
1825
1826             /* Calculate squared distance and things based on it */
1827             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1828             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1829             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1830             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1831             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1832             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1833             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1834             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1835             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1836
1837             rinv11           = avx256_invsqrt_f(rsq11);
1838             rinv12           = avx256_invsqrt_f(rsq12);
1839             rinv13           = avx256_invsqrt_f(rsq13);
1840             rinv21           = avx256_invsqrt_f(rsq21);
1841             rinv22           = avx256_invsqrt_f(rsq22);
1842             rinv23           = avx256_invsqrt_f(rsq23);
1843             rinv31           = avx256_invsqrt_f(rsq31);
1844             rinv32           = avx256_invsqrt_f(rsq32);
1845             rinv33           = avx256_invsqrt_f(rsq33);
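            /* avx256_invsqrt_f returns 1/sqrt(rsq); in single precision this
             * is typically a hardware rsqrt estimate refined by a
             * Newton-Raphson iteration, y' = y*(1.5 - 0.5*rsq*y*y). */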
1846
1847             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1848             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1849             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1850             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1851             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1852             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1853             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1854             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1855             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1856
1857             fjx1             = _mm256_setzero_ps();
1858             fjy1             = _mm256_setzero_ps();
1859             fjz1             = _mm256_setzero_ps();
1860             fjx2             = _mm256_setzero_ps();
1861             fjy2             = _mm256_setzero_ps();
1862             fjz2             = _mm256_setzero_ps();
1863             fjx3             = _mm256_setzero_ps();
1864             fjy3             = _mm256_setzero_ps();
1865             fjz3             = _mm256_setzero_ps();
1866
1867             /**************************
1868              * CALCULATE INTERACTIONS *
1869              **************************/
1870
1871             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1872             {
1873
1874             r11              = _mm256_mul_ps(rsq11,rinv11);
1875
1876             /* EWALD ELECTROSTATICS */
1877             
1878             /* Analytical PME correction */
1879             zeta2            = _mm256_mul_ps(beta2,rsq11);
1880             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1881             pmecorrF         = avx256_pmecorrF_f(zeta2);
1882             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1883             felec            = _mm256_mul_ps(qq11,felec);
1884             pmecorrV         = avx256_pmecorrV_f(zeta2);
1885             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1886             velec            = _mm256_sub_ps(rinv11,pmecorrV);
1887             velec            = _mm256_mul_ps(qq11,velec);
1888             
1889             d                = _mm256_sub_ps(r11,rswitch);
1890             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1891             d2               = _mm256_mul_ps(d,d);
1892             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1893
1894             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1895
1896             /* Evaluate switch function */
1897             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1898             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1899             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1900
1901             fscal            = felec;
1902
1903             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1904
1905             /* Calculate temporary vectorial force */
1906             tx               = _mm256_mul_ps(fscal,dx11);
1907             ty               = _mm256_mul_ps(fscal,dy11);
1908             tz               = _mm256_mul_ps(fscal,dz11);
1909
1910             /* Update vectorial force */
1911             fix1             = _mm256_add_ps(fix1,tx);
1912             fiy1             = _mm256_add_ps(fiy1,ty);
1913             fiz1             = _mm256_add_ps(fiz1,tz);
1914
1915             fjx1             = _mm256_add_ps(fjx1,tx);
1916             fjy1             = _mm256_add_ps(fjy1,ty);
1917             fjz1             = _mm256_add_ps(fjz1,tz);
1918
1919             }
1920
1921             /**************************
1922              * CALCULATE INTERACTIONS *
1923              **************************/
1924
1925             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1926             {
1927
1928             r12              = _mm256_mul_ps(rsq12,rinv12);
1929
1930             /* EWALD ELECTROSTATICS */
1931             
1932             /* Analytical PME correction */
1933             zeta2            = _mm256_mul_ps(beta2,rsq12);
1934             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1935             pmecorrF         = avx256_pmecorrF_f(zeta2);
1936             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1937             felec            = _mm256_mul_ps(qq12,felec);
1938             pmecorrV         = avx256_pmecorrV_f(zeta2);
1939             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1940             velec            = _mm256_sub_ps(rinv12,pmecorrV);
1941             velec            = _mm256_mul_ps(qq12,velec);
1942             
1943             d                = _mm256_sub_ps(r12,rswitch);
1944             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1945             d2               = _mm256_mul_ps(d,d);
1946             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1947
1948             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1949
1950             /* Evaluate switch function */
1951             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1952             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1953             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1954
1955             fscal            = felec;
1956
1957             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1958
1959             /* Calculate temporary vectorial force */
1960             tx               = _mm256_mul_ps(fscal,dx12);
1961             ty               = _mm256_mul_ps(fscal,dy12);
1962             tz               = _mm256_mul_ps(fscal,dz12);
1963
1964             /* Update vectorial force */
1965             fix1             = _mm256_add_ps(fix1,tx);
1966             fiy1             = _mm256_add_ps(fiy1,ty);
1967             fiz1             = _mm256_add_ps(fiz1,tz);
1968
1969             fjx2             = _mm256_add_ps(fjx2,tx);
1970             fjy2             = _mm256_add_ps(fjy2,ty);
1971             fjz2             = _mm256_add_ps(fjz2,tz);
1972
1973             }
1974
1975             /**************************
1976              * CALCULATE INTERACTIONS *
1977              **************************/
1978
1979             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1980             {
1981
1982             r13              = _mm256_mul_ps(rsq13,rinv13);
1983
1984             /* EWALD ELECTROSTATICS */
1985             
1986             /* Analytical PME correction */
1987             zeta2            = _mm256_mul_ps(beta2,rsq13);
1988             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
1989             pmecorrF         = avx256_pmecorrF_f(zeta2);
1990             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1991             felec            = _mm256_mul_ps(qq13,felec);
1992             pmecorrV         = avx256_pmecorrV_f(zeta2);
1993             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1994             velec            = _mm256_sub_ps(rinv13,pmecorrV);
1995             velec            = _mm256_mul_ps(qq13,velec);
1996             
1997             d                = _mm256_sub_ps(r13,rswitch);
1998             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1999             d2               = _mm256_mul_ps(d,d);
2000             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2001
2002             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2003
2004             /* Evaluate switch function */
2005             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2006             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2007             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2008
2009             fscal            = felec;
2010
2011             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2012
2013             /* Calculate temporary vectorial force */
2014             tx               = _mm256_mul_ps(fscal,dx13);
2015             ty               = _mm256_mul_ps(fscal,dy13);
2016             tz               = _mm256_mul_ps(fscal,dz13);
2017
2018             /* Update vectorial force */
2019             fix1             = _mm256_add_ps(fix1,tx);
2020             fiy1             = _mm256_add_ps(fiy1,ty);
2021             fiz1             = _mm256_add_ps(fiz1,tz);
2022
2023             fjx3             = _mm256_add_ps(fjx3,tx);
2024             fjy3             = _mm256_add_ps(fjy3,ty);
2025             fjz3             = _mm256_add_ps(fjz3,tz);
2026
2027             }
2028
2029             /**************************
2030              * CALCULATE INTERACTIONS *
2031              **************************/
2032
2033             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2034             {
2035
2036             r21              = _mm256_mul_ps(rsq21,rinv21);
2037
2038             /* EWALD ELECTROSTATICS */
2039             
2040             /* Analytical PME correction */
2041             zeta2            = _mm256_mul_ps(beta2,rsq21);
2042             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2043             pmecorrF         = avx256_pmecorrF_f(zeta2);
2044             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2045             felec            = _mm256_mul_ps(qq21,felec);
2046             pmecorrV         = avx256_pmecorrV_f(zeta2);
2047             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2048             velec            = _mm256_sub_ps(rinv21,pmecorrV);
2049             velec            = _mm256_mul_ps(qq21,velec);
2050             
2051             d                = _mm256_sub_ps(r21,rswitch);
2052             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2053             d2               = _mm256_mul_ps(d,d);
2054             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2055
2056             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2057
2058             /* Evaluate switch function */
2059             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2060             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2061             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2062
2063             fscal            = felec;
2064
2065             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2066
2067             /* Calculate temporary vectorial force */
2068             tx               = _mm256_mul_ps(fscal,dx21);
2069             ty               = _mm256_mul_ps(fscal,dy21);
2070             tz               = _mm256_mul_ps(fscal,dz21);
2071
2072             /* Update vectorial force */
2073             fix2             = _mm256_add_ps(fix2,tx);
2074             fiy2             = _mm256_add_ps(fiy2,ty);
2075             fiz2             = _mm256_add_ps(fiz2,tz);
2076
2077             fjx1             = _mm256_add_ps(fjx1,tx);
2078             fjy1             = _mm256_add_ps(fjy1,ty);
2079             fjz1             = _mm256_add_ps(fjz1,tz);
2080
2081             }
2082
2083             /**************************
2084              * CALCULATE INTERACTIONS *
2085              **************************/
2086
2087             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2088             {
2089
2090             r22              = _mm256_mul_ps(rsq22,rinv22);
2091
2092             /* EWALD ELECTROSTATICS */
2093             
2094             /* Analytical PME correction */
2095             zeta2            = _mm256_mul_ps(beta2,rsq22);
2096             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2097             pmecorrF         = avx256_pmecorrF_f(zeta2);
2098             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2099             felec            = _mm256_mul_ps(qq22,felec);
2100             pmecorrV         = avx256_pmecorrV_f(zeta2);
2101             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2102             velec            = _mm256_sub_ps(rinv22,pmecorrV);
2103             velec            = _mm256_mul_ps(qq22,velec);
2104             
2105             d                = _mm256_sub_ps(r22,rswitch);
2106             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2107             d2               = _mm256_mul_ps(d,d);
2108             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2109
2110             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2111
2112             /* Evaluate switch function */
2113             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2114             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2115             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2116
2117             fscal            = felec;
2118
2119             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2120
2121             /* Calculate temporary vectorial force */
2122             tx               = _mm256_mul_ps(fscal,dx22);
2123             ty               = _mm256_mul_ps(fscal,dy22);
2124             tz               = _mm256_mul_ps(fscal,dz22);
2125
2126             /* Update vectorial force */
2127             fix2             = _mm256_add_ps(fix2,tx);
2128             fiy2             = _mm256_add_ps(fiy2,ty);
2129             fiz2             = _mm256_add_ps(fiz2,tz);
2130
2131             fjx2             = _mm256_add_ps(fjx2,tx);
2132             fjy2             = _mm256_add_ps(fjy2,ty);
2133             fjz2             = _mm256_add_ps(fjz2,tz);
2134
2135             }
2136
2137             /**************************
2138              * CALCULATE INTERACTIONS *
2139              **************************/
2140
2141             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2142             {
2143
2144             r23              = _mm256_mul_ps(rsq23,rinv23);
2145
2146             /* EWALD ELECTROSTATICS */
2147             
2148             /* Analytical PME correction */
2149             zeta2            = _mm256_mul_ps(beta2,rsq23);
2150             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
2151             pmecorrF         = avx256_pmecorrF_f(zeta2);
2152             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2153             felec            = _mm256_mul_ps(qq23,felec);
2154             pmecorrV         = avx256_pmecorrV_f(zeta2);
2155             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2156             velec            = _mm256_sub_ps(rinv23,pmecorrV);
2157             velec            = _mm256_mul_ps(qq23,velec);
2158             
2159             d                = _mm256_sub_ps(r23,rswitch);
2160             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2161             d2               = _mm256_mul_ps(d,d);
2162             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2163
2164             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2165
2166             /* Evaluate switch function */
2167             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2168             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2169             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2170
2171             fscal            = felec;
2172
2173             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2174
2175             /* Calculate temporary vectorial force */
2176             tx               = _mm256_mul_ps(fscal,dx23);
2177             ty               = _mm256_mul_ps(fscal,dy23);
2178             tz               = _mm256_mul_ps(fscal,dz23);
2179
2180             /* Update vectorial force */
2181             fix2             = _mm256_add_ps(fix2,tx);
2182             fiy2             = _mm256_add_ps(fiy2,ty);
2183             fiz2             = _mm256_add_ps(fiz2,tz);
2184
2185             fjx3             = _mm256_add_ps(fjx3,tx);
2186             fjy3             = _mm256_add_ps(fjy3,ty);
2187             fjz3             = _mm256_add_ps(fjz3,tz);
2188
2189             }
2190
2191             /**************************
2192              * CALCULATE INTERACTIONS *
2193              **************************/
2194
2195             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2196             {
2197
2198             r31              = _mm256_mul_ps(rsq31,rinv31);
2199
2200             /* EWALD ELECTROSTATICS */
2201             
2202             /* Analytical PME correction */
2203             zeta2            = _mm256_mul_ps(beta2,rsq31);
2204             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
2205             pmecorrF         = avx256_pmecorrF_f(zeta2);
2206             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2207             felec            = _mm256_mul_ps(qq31,felec);
2208             pmecorrV         = avx256_pmecorrV_f(zeta2);
2209             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2210             velec            = _mm256_sub_ps(rinv31,pmecorrV);
2211             velec            = _mm256_mul_ps(qq31,velec);
2212             
2213             d                = _mm256_sub_ps(r31,rswitch);
2214             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2215             d2               = _mm256_mul_ps(d,d);
2216             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2217
2218             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2219
2220             /* Evaluate switch function */
2221             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2222             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2223             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2224
2225             fscal            = felec;
2226
2227             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2228
2229             /* Calculate temporary vectorial force */
2230             tx               = _mm256_mul_ps(fscal,dx31);
2231             ty               = _mm256_mul_ps(fscal,dy31);
2232             tz               = _mm256_mul_ps(fscal,dz31);
2233
2234             /* Update vectorial force */
2235             fix3             = _mm256_add_ps(fix3,tx);
2236             fiy3             = _mm256_add_ps(fiy3,ty);
2237             fiz3             = _mm256_add_ps(fiz3,tz);
2238
2239             fjx1             = _mm256_add_ps(fjx1,tx);
2240             fjy1             = _mm256_add_ps(fjy1,ty);
2241             fjz1             = _mm256_add_ps(fjz1,tz);
2242
2243             }
2244
2245             /**************************
2246              * CALCULATE INTERACTIONS *
2247              **************************/
2248
2249             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2250             {
2251
2252             r32              = _mm256_mul_ps(rsq32,rinv32);
2253
2254             /* EWALD ELECTROSTATICS */
2255             
2256             /* Analytical PME correction */
2257             zeta2            = _mm256_mul_ps(beta2,rsq32);
2258             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
2259             pmecorrF         = avx256_pmecorrF_f(zeta2);
2260             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2261             felec            = _mm256_mul_ps(qq32,felec);
2262             pmecorrV         = avx256_pmecorrV_f(zeta2);
2263             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2264             velec            = _mm256_sub_ps(rinv32,pmecorrV);
2265             velec            = _mm256_mul_ps(qq32,velec);
2266             
2267             d                = _mm256_sub_ps(r32,rswitch);
2268             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2269             d2               = _mm256_mul_ps(d,d);
2270             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2271
2272             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2273
2274             /* Evaluate switch function */
2275             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2276             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2277             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2278
2279             fscal            = felec;
2280
2281             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2282
2283             /* Calculate temporary vectorial force */
2284             tx               = _mm256_mul_ps(fscal,dx32);
2285             ty               = _mm256_mul_ps(fscal,dy32);
2286             tz               = _mm256_mul_ps(fscal,dz32);
2287
2288             /* Update vectorial force */
2289             fix3             = _mm256_add_ps(fix3,tx);
2290             fiy3             = _mm256_add_ps(fiy3,ty);
2291             fiz3             = _mm256_add_ps(fiz3,tz);
2292
2293             fjx2             = _mm256_add_ps(fjx2,tx);
2294             fjy2             = _mm256_add_ps(fjy2,ty);
2295             fjz2             = _mm256_add_ps(fjz2,tz);
2296
2297             }
2298
2299             /**************************
2300              * CALCULATE INTERACTIONS *
2301              **************************/
2302
2303             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2304             {
2305
2306             r33              = _mm256_mul_ps(rsq33,rinv33);
2307
2308             /* EWALD ELECTROSTATICS */
2309             
2310             /* Analytical PME correction */
2311             zeta2            = _mm256_mul_ps(beta2,rsq33);
2312             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
2313             pmecorrF         = avx256_pmecorrF_f(zeta2);
2314             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2315             felec            = _mm256_mul_ps(qq33,felec);
2316             pmecorrV         = avx256_pmecorrV_f(zeta2);
2317             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2318             velec            = _mm256_sub_ps(rinv33,pmecorrV);
2319             velec            = _mm256_mul_ps(qq33,velec);
2320             
2321             d                = _mm256_sub_ps(r33,rswitch);
2322             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2323             d2               = _mm256_mul_ps(d,d);
2324             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2325
2326             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2327
2328             /* Evaluate switch function */
2329             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2330             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2331             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2332
2333             fscal            = felec;
2334
2335             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2336
2337             /* Calculate temporary vectorial force */
2338             tx               = _mm256_mul_ps(fscal,dx33);
2339             ty               = _mm256_mul_ps(fscal,dy33);
2340             tz               = _mm256_mul_ps(fscal,dz33);
2341
2342             /* Update vectorial force */
2343             fix3             = _mm256_add_ps(fix3,tx);
2344             fiy3             = _mm256_add_ps(fiy3,ty);
2345             fiz3             = _mm256_add_ps(fiz3,tz);
2346
2347             fjx3             = _mm256_add_ps(fjx3,tx);
2348             fjy3             = _mm256_add_ps(fjy3,ty);
2349             fjz3             = _mm256_add_ps(fjz3,tz);
2350
2351             }
2352
2353             fjptrA             = f+j_coord_offsetA;
2354             fjptrB             = f+j_coord_offsetB;
2355             fjptrC             = f+j_coord_offsetC;
2356             fjptrD             = f+j_coord_offsetD;
2357             fjptrE             = f+j_coord_offsetE;
2358             fjptrF             = f+j_coord_offsetF;
2359             fjptrG             = f+j_coord_offsetG;
2360             fjptrH             = f+j_coord_offsetH;
2361
2362             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2363                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2364                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
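                 /* Only atoms 1-3 of each 4-site water interact in this kernel
                  * (VdW is not evaluated here, and the first site of a 4-point water
                  * typically carries no charge), hence the +DIM offsets that skip
                  * the first atom of every j water. */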
2365
2366             /* Inner loop uses 945 flops */
2367         }
2368
2369         if (jidx < j_index_end)
2370         {
2371
2372             /* Get j neighbor indices and the corresponding coordinate offsets */
2373             jnrlistA         = jjnr[jidx];
2374             jnrlistB         = jjnr[jidx+1];
2375             jnrlistC         = jjnr[jidx+2];
2376             jnrlistD         = jjnr[jidx+3];
2377             jnrlistE         = jjnr[jidx+4];
2378             jnrlistF         = jjnr[jidx+5];
2379             jnrlistG         = jjnr[jidx+6];
2380             jnrlistH         = jjnr[jidx+7];
2381             /* The j-index is negative for padding (non-real) atoms. This mask is
2382              * 0xFFFFFFFF for such dummy entries and 0x0 for real ones, so it can be
2383              * used as val = _mm256_andnot_ps(dummy_mask,val) to clear dummy entries.
2384              */
2385             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2386                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2387                                             
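                 /* Each 128-bit half is compared as 32-bit integers against zero and
                  * the halves are recombined, so lane k of dummy_mask is all-ones
                  * exactly when jjnr[jidx+k] < 0. */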
2388             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2389             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2390             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2391             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2392             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2393             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2394             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2395             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
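                 /* Padding indices are clamped to 0 so that the coordinate gathers
                  * below stay in bounds; the spurious interactions computed for such
                  * lanes are removed again through dummy_mask. */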
2396             j_coord_offsetA  = DIM*jnrA;
2397             j_coord_offsetB  = DIM*jnrB;
2398             j_coord_offsetC  = DIM*jnrC;
2399             j_coord_offsetD  = DIM*jnrD;
2400             j_coord_offsetE  = DIM*jnrE;
2401             j_coord_offsetF  = DIM*jnrF;
2402             j_coord_offsetG  = DIM*jnrG;
2403             j_coord_offsetH  = DIM*jnrH;
2404
2405             /* load j atom coordinates */
2406             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2407                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2408                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
2409                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
2410                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2411
2412             /* Calculate displacement vector */
2413             dx11             = _mm256_sub_ps(ix1,jx1);
2414             dy11             = _mm256_sub_ps(iy1,jy1);
2415             dz11             = _mm256_sub_ps(iz1,jz1);
2416             dx12             = _mm256_sub_ps(ix1,jx2);
2417             dy12             = _mm256_sub_ps(iy1,jy2);
2418             dz12             = _mm256_sub_ps(iz1,jz2);
2419             dx13             = _mm256_sub_ps(ix1,jx3);
2420             dy13             = _mm256_sub_ps(iy1,jy3);
2421             dz13             = _mm256_sub_ps(iz1,jz3);
2422             dx21             = _mm256_sub_ps(ix2,jx1);
2423             dy21             = _mm256_sub_ps(iy2,jy1);
2424             dz21             = _mm256_sub_ps(iz2,jz1);
2425             dx22             = _mm256_sub_ps(ix2,jx2);
2426             dy22             = _mm256_sub_ps(iy2,jy2);
2427             dz22             = _mm256_sub_ps(iz2,jz2);
2428             dx23             = _mm256_sub_ps(ix2,jx3);
2429             dy23             = _mm256_sub_ps(iy2,jy3);
2430             dz23             = _mm256_sub_ps(iz2,jz3);
2431             dx31             = _mm256_sub_ps(ix3,jx1);
2432             dy31             = _mm256_sub_ps(iy3,jy1);
2433             dz31             = _mm256_sub_ps(iz3,jz1);
2434             dx32             = _mm256_sub_ps(ix3,jx2);
2435             dy32             = _mm256_sub_ps(iy3,jy2);
2436             dz32             = _mm256_sub_ps(iz3,jz2);
2437             dx33             = _mm256_sub_ps(ix3,jx3);
2438             dy33             = _mm256_sub_ps(iy3,jy3);
2439             dz33             = _mm256_sub_ps(iz3,jz3);
2440
2441             /* Calculate squared distances and quantities derived from them */
2442             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2443             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2444             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2445             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2446             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2447             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2448             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2449             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2450             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2451
2452             rinv11           = avx256_invsqrt_f(rsq11);
2453             rinv12           = avx256_invsqrt_f(rsq12);
2454             rinv13           = avx256_invsqrt_f(rsq13);
2455             rinv21           = avx256_invsqrt_f(rsq21);
2456             rinv22           = avx256_invsqrt_f(rsq22);
2457             rinv23           = avx256_invsqrt_f(rsq23);
2458             rinv31           = avx256_invsqrt_f(rsq31);
2459             rinv32           = avx256_invsqrt_f(rsq32);
2460             rinv33           = avx256_invsqrt_f(rsq33);
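                 /* avx256_invsqrt_f presumably uses the common scheme of a hardware
                  * rsqrt estimate refined by one Newton-Raphson iteration to reach
                  * full single precision. */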
2461
2462             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
2463             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
2464             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
2465             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
2466             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
2467             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
2468             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
2469             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
2470             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
2471
2472             fjx1             = _mm256_setzero_ps();
2473             fjy1             = _mm256_setzero_ps();
2474             fjz1             = _mm256_setzero_ps();
2475             fjx2             = _mm256_setzero_ps();
2476             fjy2             = _mm256_setzero_ps();
2477             fjz2             = _mm256_setzero_ps();
2478             fjx3             = _mm256_setzero_ps();
2479             fjy3             = _mm256_setzero_ps();
2480             fjz3             = _mm256_setzero_ps();
2481
2482             /**************************
2483              * CALCULATE INTERACTIONS *
2484              **************************/
2485
2486             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2487             {
2488
2489             r11              = _mm256_mul_ps(rsq11,rinv11);
2490             r11              = _mm256_andnot_ps(dummy_mask,r11);
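                 /* Zeroing r in the padding lanes keeps the switch distance
                  * d = r - rswitch well-defined for them; their contribution to
                  * fscal is cleared below in any case. */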
2491
2492             /* EWALD ELECTROSTATICS */
2493             
2494             /* Analytical PME correction */
2495             zeta2            = _mm256_mul_ps(beta2,rsq11);
2496             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2497             pmecorrF         = avx256_pmecorrF_f(zeta2);
2498             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2499             felec            = _mm256_mul_ps(qq11,felec);
2500             pmecorrV         = avx256_pmecorrV_f(zeta2);
2501             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2502             velec            = _mm256_sub_ps(rinv11,pmecorrV);
2503             velec            = _mm256_mul_ps(qq11,velec);
2504             
2505             d                = _mm256_sub_ps(r11,rswitch);
2506             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2507             d2               = _mm256_mul_ps(d,d);
2508             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2509
2510             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2511
2512             /* Evaluate switch function */
2513             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2514             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2515             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2516
2517             fscal            = felec;
2518
2519             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2520
2521             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2522
2523             /* Calculate temporary vectorial force */
2524             tx               = _mm256_mul_ps(fscal,dx11);
2525             ty               = _mm256_mul_ps(fscal,dy11);
2526             tz               = _mm256_mul_ps(fscal,dz11);
2527
2528             /* Update vectorial force */
2529             fix1             = _mm256_add_ps(fix1,tx);
2530             fiy1             = _mm256_add_ps(fiy1,ty);
2531             fiz1             = _mm256_add_ps(fiz1,tz);
2532
2533             fjx1             = _mm256_add_ps(fjx1,tx);
2534             fjy1             = _mm256_add_ps(fjy1,ty);
2535             fjz1             = _mm256_add_ps(fjz1,tz);
2536
2537             }
2538
2539             /**************************
2540              * CALCULATE INTERACTIONS *
2541              **************************/
2542
2543             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2544             {
2545
2546             r12              = _mm256_mul_ps(rsq12,rinv12);
2547             r12              = _mm256_andnot_ps(dummy_mask,r12);
2548
2549             /* EWALD ELECTROSTATICS */
2550             
2551             /* Analytical PME correction */
2552             zeta2            = _mm256_mul_ps(beta2,rsq12);
2553             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2554             pmecorrF         = avx256_pmecorrF_f(zeta2);
2555             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2556             felec            = _mm256_mul_ps(qq12,felec);
2557             pmecorrV         = avx256_pmecorrV_f(zeta2);
2558             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2559             velec            = _mm256_sub_ps(rinv12,pmecorrV);
2560             velec            = _mm256_mul_ps(qq12,velec);
2561             
2562             d                = _mm256_sub_ps(r12,rswitch);
2563             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2564             d2               = _mm256_mul_ps(d,d);
2565             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2566
2567             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2568
2569             /* Evaluate switch function */
2570             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2571             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2572             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2573
2574             fscal            = felec;
2575
2576             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2577
2578             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2579
2580             /* Calculate temporary vectorial force */
2581             tx               = _mm256_mul_ps(fscal,dx12);
2582             ty               = _mm256_mul_ps(fscal,dy12);
2583             tz               = _mm256_mul_ps(fscal,dz12);
2584
2585             /* Update vectorial force */
2586             fix1             = _mm256_add_ps(fix1,tx);
2587             fiy1             = _mm256_add_ps(fiy1,ty);
2588             fiz1             = _mm256_add_ps(fiz1,tz);
2589
2590             fjx2             = _mm256_add_ps(fjx2,tx);
2591             fjy2             = _mm256_add_ps(fjy2,ty);
2592             fjz2             = _mm256_add_ps(fjz2,tz);
2593
2594             }
2595
2596             /**************************
2597              * CALCULATE INTERACTIONS *
2598              **************************/
2599
2600             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2601             {
2602
2603             r13              = _mm256_mul_ps(rsq13,rinv13);
2604             r13              = _mm256_andnot_ps(dummy_mask,r13);
2605
2606             /* EWALD ELECTROSTATICS */
2607             
2608             /* Analytical PME correction */
2609             zeta2            = _mm256_mul_ps(beta2,rsq13);
2610             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
2611             pmecorrF         = avx256_pmecorrF_f(zeta2);
2612             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2613             felec            = _mm256_mul_ps(qq13,felec);
2614             pmecorrV         = avx256_pmecorrV_f(zeta2);
2615             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2616             velec            = _mm256_sub_ps(rinv13,pmecorrV);
2617             velec            = _mm256_mul_ps(qq13,velec);
2618             
2619             d                = _mm256_sub_ps(r13,rswitch);
2620             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2621             d2               = _mm256_mul_ps(d,d);
2622             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2623
2624             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2625
2626             /* Evaluate switch function */
2627             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2628             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2629             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2630
2631             fscal            = felec;
2632
2633             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2634
2635             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2636
2637             /* Calculate temporary vectorial force */
2638             tx               = _mm256_mul_ps(fscal,dx13);
2639             ty               = _mm256_mul_ps(fscal,dy13);
2640             tz               = _mm256_mul_ps(fscal,dz13);
2641
2642             /* Update vectorial force */
2643             fix1             = _mm256_add_ps(fix1,tx);
2644             fiy1             = _mm256_add_ps(fiy1,ty);
2645             fiz1             = _mm256_add_ps(fiz1,tz);
2646
2647             fjx3             = _mm256_add_ps(fjx3,tx);
2648             fjy3             = _mm256_add_ps(fjy3,ty);
2649             fjz3             = _mm256_add_ps(fjz3,tz);
2650
2651             }
2652
2653             /**************************
2654              * CALCULATE INTERACTIONS *
2655              **************************/
2656
2657             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2658             {
2659
2660             r21              = _mm256_mul_ps(rsq21,rinv21);
2661             r21              = _mm256_andnot_ps(dummy_mask,r21);
2662
2663             /* EWALD ELECTROSTATICS */
2664             
2665             /* Analytical PME correction */
2666             zeta2            = _mm256_mul_ps(beta2,rsq21);
2667             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2668             pmecorrF         = avx256_pmecorrF_f(zeta2);
2669             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2670             felec            = _mm256_mul_ps(qq21,felec);
2671             pmecorrV         = avx256_pmecorrV_f(zeta2);
2672             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2673             velec            = _mm256_sub_ps(rinv21,pmecorrV);
2674             velec            = _mm256_mul_ps(qq21,velec);
2675             
2676             d                = _mm256_sub_ps(r21,rswitch);
2677             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2678             d2               = _mm256_mul_ps(d,d);
2679             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2680
2681             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2682
2683             /* Evaluate switch function */
2684             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2685             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2686             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2687
2688             fscal            = felec;
2689
2690             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2691
2692             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2693
2694             /* Calculate temporary vectorial force */
2695             tx               = _mm256_mul_ps(fscal,dx21);
2696             ty               = _mm256_mul_ps(fscal,dy21);
2697             tz               = _mm256_mul_ps(fscal,dz21);
2698
2699             /* Update vectorial force */
2700             fix2             = _mm256_add_ps(fix2,tx);
2701             fiy2             = _mm256_add_ps(fiy2,ty);
2702             fiz2             = _mm256_add_ps(fiz2,tz);
2703
2704             fjx1             = _mm256_add_ps(fjx1,tx);
2705             fjy1             = _mm256_add_ps(fjy1,ty);
2706             fjz1             = _mm256_add_ps(fjz1,tz);
2707
2708             }
2709
2710             /**************************
2711              * CALCULATE INTERACTIONS *
2712              **************************/
2713
2714             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2715             {
2716
2717             r22              = _mm256_mul_ps(rsq22,rinv22);
2718             r22              = _mm256_andnot_ps(dummy_mask,r22);
2719
2720             /* EWALD ELECTROSTATICS */
2721             
2722             /* Analytical PME correction */
2723             zeta2            = _mm256_mul_ps(beta2,rsq22);
2724             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2725             pmecorrF         = avx256_pmecorrF_f(zeta2);
2726             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2727             felec            = _mm256_mul_ps(qq22,felec);
2728             pmecorrV         = avx256_pmecorrV_f(zeta2);
2729             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2730             velec            = _mm256_sub_ps(rinv22,pmecorrV);
2731             velec            = _mm256_mul_ps(qq22,velec);
2732             
2733             d                = _mm256_sub_ps(r22,rswitch);
2734             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2735             d2               = _mm256_mul_ps(d,d);
2736             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2737
2738             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2739
2740             /* Evaluate switch function */
2741             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2742             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2743             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2744
2745             fscal            = felec;
2746
2747             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2748
2749             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2750
2751             /* Calculate temporary vectorial force */
2752             tx               = _mm256_mul_ps(fscal,dx22);
2753             ty               = _mm256_mul_ps(fscal,dy22);
2754             tz               = _mm256_mul_ps(fscal,dz22);
2755
2756             /* Update vectorial force */
2757             fix2             = _mm256_add_ps(fix2,tx);
2758             fiy2             = _mm256_add_ps(fiy2,ty);
2759             fiz2             = _mm256_add_ps(fiz2,tz);
2760
2761             fjx2             = _mm256_add_ps(fjx2,tx);
2762             fjy2             = _mm256_add_ps(fjy2,ty);
2763             fjz2             = _mm256_add_ps(fjz2,tz);
2764
2765             }
2766
2767             /**************************
2768              * CALCULATE INTERACTIONS *
2769              **************************/
2770
2771             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2772             {
2773
2774             r23              = _mm256_mul_ps(rsq23,rinv23);
2775             r23              = _mm256_andnot_ps(dummy_mask,r23);
2776
2777             /* EWALD ELECTROSTATICS */
2778             
2779             /* Analytical PME correction */
2780             zeta2            = _mm256_mul_ps(beta2,rsq23);
2781             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
2782             pmecorrF         = avx256_pmecorrF_f(zeta2);
2783             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2784             felec            = _mm256_mul_ps(qq23,felec);
2785             pmecorrV         = avx256_pmecorrV_f(zeta2);
2786             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2787             velec            = _mm256_sub_ps(rinv23,pmecorrV);
2788             velec            = _mm256_mul_ps(qq23,velec);
2789             
2790             d                = _mm256_sub_ps(r23,rswitch);
2791             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2792             d2               = _mm256_mul_ps(d,d);
2793             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2794
2795             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2796
2797             /* Evaluate switch function */
2798             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2799             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2800             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2801
2802             fscal            = felec;
2803
2804             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2805
2806             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2807
2808             /* Calculate temporary vectorial force */
2809             tx               = _mm256_mul_ps(fscal,dx23);
2810             ty               = _mm256_mul_ps(fscal,dy23);
2811             tz               = _mm256_mul_ps(fscal,dz23);
2812
2813             /* Update vectorial force */
2814             fix2             = _mm256_add_ps(fix2,tx);
2815             fiy2             = _mm256_add_ps(fiy2,ty);
2816             fiz2             = _mm256_add_ps(fiz2,tz);
2817
2818             fjx3             = _mm256_add_ps(fjx3,tx);
2819             fjy3             = _mm256_add_ps(fjy3,ty);
2820             fjz3             = _mm256_add_ps(fjz3,tz);
2821
2822             }
2823
2824             /**************************
2825              * CALCULATE INTERACTIONS *
2826              **************************/
2827
2828             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2829             {
2830
2831             r31              = _mm256_mul_ps(rsq31,rinv31);
2832             r31              = _mm256_andnot_ps(dummy_mask,r31);
2833
2834             /* EWALD ELECTROSTATICS */
2835             
2836             /* Analytical PME correction */
2837             zeta2            = _mm256_mul_ps(beta2,rsq31);
2838             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
2839             pmecorrF         = avx256_pmecorrF_f(zeta2);
2840             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2841             felec            = _mm256_mul_ps(qq31,felec);
2842             pmecorrV         = avx256_pmecorrV_f(zeta2);
2843             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2844             velec            = _mm256_sub_ps(rinv31,pmecorrV);
2845             velec            = _mm256_mul_ps(qq31,velec);
2846             
2847             d                = _mm256_sub_ps(r31,rswitch);
2848             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2849             d2               = _mm256_mul_ps(d,d);
2850             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2851
2852             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2853
2854             /* Evaluate switch function */
2855             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2856             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2857             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2858
2859             fscal            = felec;
2860
2861             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2862
2863             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2864
2865             /* Calculate temporary vectorial force */
2866             tx               = _mm256_mul_ps(fscal,dx31);
2867             ty               = _mm256_mul_ps(fscal,dy31);
2868             tz               = _mm256_mul_ps(fscal,dz31);
2869
2870             /* Update vectorial force */
2871             fix3             = _mm256_add_ps(fix3,tx);
2872             fiy3             = _mm256_add_ps(fiy3,ty);
2873             fiz3             = _mm256_add_ps(fiz3,tz);
2874
2875             fjx1             = _mm256_add_ps(fjx1,tx);
2876             fjy1             = _mm256_add_ps(fjy1,ty);
2877             fjz1             = _mm256_add_ps(fjz1,tz);
2878
2879             }
2880
2881             /**************************
2882              * CALCULATE INTERACTIONS *
2883              **************************/
2884
2885             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2886             {
2887
2888             r32              = _mm256_mul_ps(rsq32,rinv32);
2889             r32              = _mm256_andnot_ps(dummy_mask,r32);
2890
2891             /* EWALD ELECTROSTATICS */
2892             
2893             /* Analytical PME correction */
2894             zeta2            = _mm256_mul_ps(beta2,rsq32);
2895             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
2896             pmecorrF         = avx256_pmecorrF_f(zeta2);
2897             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2898             felec            = _mm256_mul_ps(qq32,felec);
2899             pmecorrV         = avx256_pmecorrV_f(zeta2);
2900             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2901             velec            = _mm256_sub_ps(rinv32,pmecorrV);
2902             velec            = _mm256_mul_ps(qq32,velec);
2903             
2904             d                = _mm256_sub_ps(r32,rswitch);
2905             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2906             d2               = _mm256_mul_ps(d,d);
2907             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2908
2909             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2910
2911             /* Evaluate switch function */
2912             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2913             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2914             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2915
2916             fscal            = felec;
2917
2918             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2919
2920             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2921
2922             /* Calculate temporary vectorial force */
2923             tx               = _mm256_mul_ps(fscal,dx32);
2924             ty               = _mm256_mul_ps(fscal,dy32);
2925             tz               = _mm256_mul_ps(fscal,dz32);
2926
2927             /* Update vectorial force */
2928             fix3             = _mm256_add_ps(fix3,tx);
2929             fiy3             = _mm256_add_ps(fiy3,ty);
2930             fiz3             = _mm256_add_ps(fiz3,tz);
2931
2932             fjx2             = _mm256_add_ps(fjx2,tx);
2933             fjy2             = _mm256_add_ps(fjy2,ty);
2934             fjz2             = _mm256_add_ps(fjz2,tz);
2935
2936             }
2937
2938             /**************************
2939              * CALCULATE INTERACTIONS *
2940              **************************/
2941
2942             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2943             {
2944
2945             r33              = _mm256_mul_ps(rsq33,rinv33);
2946             r33              = _mm256_andnot_ps(dummy_mask,r33);
2947
2948             /* EWALD ELECTROSTATICS */
2949             
2950             /* Analytical PME correction */
2951             zeta2            = _mm256_mul_ps(beta2,rsq33);
2952             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
2953             pmecorrF         = avx256_pmecorrF_f(zeta2);
2954             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2955             felec            = _mm256_mul_ps(qq33,felec);
2956             pmecorrV         = avx256_pmecorrV_f(zeta2);
2957             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2958             velec            = _mm256_sub_ps(rinv33,pmecorrV);
2959             velec            = _mm256_mul_ps(qq33,velec);
2960             
2961             d                = _mm256_sub_ps(r33,rswitch);
2962             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2963             d2               = _mm256_mul_ps(d,d);
2964             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2965
2966             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2967
2968             /* Evaluate switch function */
2969             /* fscal_sw = f_sw/r = -(v*sw)'/r = -(v'*sw+v*dsw)/r = -v'*sw/r-v*dsw/r = fscal*sw-v*dsw/r */
2970             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2971             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2972
2973             fscal            = felec;
2974
2975             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2976
2977             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2978
2979             /* Calculate temporary vectorial force */
2980             tx               = _mm256_mul_ps(fscal,dx33);
2981             ty               = _mm256_mul_ps(fscal,dy33);
2982             tz               = _mm256_mul_ps(fscal,dz33);
2983
2984             /* Update vectorial force */
2985             fix3             = _mm256_add_ps(fix3,tx);
2986             fiy3             = _mm256_add_ps(fiy3,ty);
2987             fiz3             = _mm256_add_ps(fiz3,tz);
2988
2989             fjx3             = _mm256_add_ps(fjx3,tx);
2990             fjy3             = _mm256_add_ps(fjy3,ty);
2991             fjz3             = _mm256_add_ps(fjz3,tz);
2992
2993             }
2994
2995             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2996             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2997             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2998             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2999             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3000             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3001             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3002             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
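                 /* Padding lanes are redirected to a scratch buffer so the force
                  * decrement below never writes to the force array for atoms that
                  * do not exist. */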
3003
3004             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
3005                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
3006                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3007
3008             /* Inner loop uses 954 flops */
3009         }
3010
3011         /* End of innermost loop */
3012
3013         gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3014                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
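             /* The swizzle transposes the SIMD accumulators back into per-atom xyz
              * triplets and adds them to f (again offset by DIM past atom 0) and to
              * the shift-force array used for the virial. */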
3015
3016         /* Increment number of inner iterations */
3017         inneriter                  += j_index_end - j_index_start;
3018
3019         /* Outer loop uses 18 flops */
3020     }
3021
3022     /* Increment number of outer iterations */
3023     outeriter        += nri;
3024
3025     /* Update outer/inner flops */
3026
3027     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*954);
3028 }