ce0427bb96cf8c5f69c5e17b203116516e436763
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEwSw_VdwNone_GeomW4W4_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            None
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_VF_avx_256_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrE,jnrF,jnrG,jnrH;
76     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84     real             scratch[4*DIM];
85     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr1;
87     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88     real *           vdwioffsetptr2;
89     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90     real *           vdwioffsetptr3;
91     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
92     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
93     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
95     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
97     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
101     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
104     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
105     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
106     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
107     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
108     real             *charge;
109     __m256i          ewitab;
110     __m128i          ewitab_lo,ewitab_hi;
111     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
112     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
113     real             *ewtab;
114     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
115     real             rswitch_scalar,d_scalar;
116     __m256           dummy_mask,cutoff_mask;
117     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
118     __m256           one     = _mm256_set1_ps(1.0);
119     __m256           two     = _mm256_set1_ps(2.0);
120     x                = xx[0];
121     f                = ff[0];
122
123     nri              = nlist->nri;
124     iinr             = nlist->iinr;
125     jindex           = nlist->jindex;
126     jjnr             = nlist->jjnr;
127     shiftidx         = nlist->shift;
128     gid              = nlist->gid;
129     shiftvec         = fr->shift_vec[0];
130     fshift           = fr->fshift[0];
131     facel            = _mm256_set1_ps(fr->epsfac);
132     charge           = mdatoms->chargeA;
133
134     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
135     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
136     beta2            = _mm256_mul_ps(beta,beta);
137     beta3            = _mm256_mul_ps(beta,beta2);
138
139     ewtab            = fr->ic->tabq_coul_FDV0;
140     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
141     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
142
143     /* Setup water-specific parameters */
144     inr              = nlist->iinr[0];
145     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
146     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
147     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
148
149     jq1              = _mm256_set1_ps(charge[inr+1]);
150     jq2              = _mm256_set1_ps(charge[inr+2]);
151     jq3              = _mm256_set1_ps(charge[inr+3]);
152     qq11             = _mm256_mul_ps(iq1,jq1);
153     qq12             = _mm256_mul_ps(iq1,jq2);
154     qq13             = _mm256_mul_ps(iq1,jq3);
155     qq21             = _mm256_mul_ps(iq2,jq1);
156     qq22             = _mm256_mul_ps(iq2,jq2);
157     qq23             = _mm256_mul_ps(iq2,jq3);
158     qq31             = _mm256_mul_ps(iq3,jq1);
159     qq32             = _mm256_mul_ps(iq3,jq2);
160     qq33             = _mm256_mul_ps(iq3,jq3);
161
162     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
163     rcutoff_scalar   = fr->rcoulomb;
164     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
165     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
166
167     rswitch_scalar   = fr->rcoulomb_switch;
168     rswitch          = _mm256_set1_ps(rswitch_scalar);
169     /* Setup switch parameters */
170     d_scalar         = rcutoff_scalar-rswitch_scalar;
171     d                = _mm256_set1_ps(d_scalar);
172     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
173     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
174     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
175     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
176     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
177     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
178
179     /* Avoid stupid compiler warnings */
180     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
181     j_coord_offsetA = 0;
182     j_coord_offsetB = 0;
183     j_coord_offsetC = 0;
184     j_coord_offsetD = 0;
185     j_coord_offsetE = 0;
186     j_coord_offsetF = 0;
187     j_coord_offsetG = 0;
188     j_coord_offsetH = 0;
189
190     outeriter        = 0;
191     inneriter        = 0;
192
193     for(iidx=0;iidx<4*DIM;iidx++)
194     {
195         scratch[iidx] = 0.0;
196     }
197
198     /* Start outer loop over neighborlists */
199     for(iidx=0; iidx<nri; iidx++)
200     {
201         /* Load shift vector for this list */
202         i_shift_offset   = DIM*shiftidx[iidx];
203
204         /* Load limits for loop over neighbors */
205         j_index_start    = jindex[iidx];
206         j_index_end      = jindex[iidx+1];
207
208         /* Get outer coordinate index */
209         inr              = iinr[iidx];
210         i_coord_offset   = DIM*inr;
211
212         /* Load i particle coords and add shift vector */
213         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
214                                                     &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
215
216         fix1             = _mm256_setzero_ps();
217         fiy1             = _mm256_setzero_ps();
218         fiz1             = _mm256_setzero_ps();
219         fix2             = _mm256_setzero_ps();
220         fiy2             = _mm256_setzero_ps();
221         fiz2             = _mm256_setzero_ps();
222         fix3             = _mm256_setzero_ps();
223         fiy3             = _mm256_setzero_ps();
224         fiz3             = _mm256_setzero_ps();
225
226         /* Reset potential sums */
227         velecsum         = _mm256_setzero_ps();
228
229         /* Start inner kernel loop */
230         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
231         {
232
233             /* Get j neighbor index, and coordinate index */
234             jnrA             = jjnr[jidx];
235             jnrB             = jjnr[jidx+1];
236             jnrC             = jjnr[jidx+2];
237             jnrD             = jjnr[jidx+3];
238             jnrE             = jjnr[jidx+4];
239             jnrF             = jjnr[jidx+5];
240             jnrG             = jjnr[jidx+6];
241             jnrH             = jjnr[jidx+7];
242             j_coord_offsetA  = DIM*jnrA;
243             j_coord_offsetB  = DIM*jnrB;
244             j_coord_offsetC  = DIM*jnrC;
245             j_coord_offsetD  = DIM*jnrD;
246             j_coord_offsetE  = DIM*jnrE;
247             j_coord_offsetF  = DIM*jnrF;
248             j_coord_offsetG  = DIM*jnrG;
249             j_coord_offsetH  = DIM*jnrH;
250
251             /* load j atom coordinates */
252             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
253                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
254                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
255                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
256                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
257
258             /* Calculate displacement vector */
259             dx11             = _mm256_sub_ps(ix1,jx1);
260             dy11             = _mm256_sub_ps(iy1,jy1);
261             dz11             = _mm256_sub_ps(iz1,jz1);
262             dx12             = _mm256_sub_ps(ix1,jx2);
263             dy12             = _mm256_sub_ps(iy1,jy2);
264             dz12             = _mm256_sub_ps(iz1,jz2);
265             dx13             = _mm256_sub_ps(ix1,jx3);
266             dy13             = _mm256_sub_ps(iy1,jy3);
267             dz13             = _mm256_sub_ps(iz1,jz3);
268             dx21             = _mm256_sub_ps(ix2,jx1);
269             dy21             = _mm256_sub_ps(iy2,jy1);
270             dz21             = _mm256_sub_ps(iz2,jz1);
271             dx22             = _mm256_sub_ps(ix2,jx2);
272             dy22             = _mm256_sub_ps(iy2,jy2);
273             dz22             = _mm256_sub_ps(iz2,jz2);
274             dx23             = _mm256_sub_ps(ix2,jx3);
275             dy23             = _mm256_sub_ps(iy2,jy3);
276             dz23             = _mm256_sub_ps(iz2,jz3);
277             dx31             = _mm256_sub_ps(ix3,jx1);
278             dy31             = _mm256_sub_ps(iy3,jy1);
279             dz31             = _mm256_sub_ps(iz3,jz1);
280             dx32             = _mm256_sub_ps(ix3,jx2);
281             dy32             = _mm256_sub_ps(iy3,jy2);
282             dz32             = _mm256_sub_ps(iz3,jz2);
283             dx33             = _mm256_sub_ps(ix3,jx3);
284             dy33             = _mm256_sub_ps(iy3,jy3);
285             dz33             = _mm256_sub_ps(iz3,jz3);
286
287             /* Calculate squared distance and things based on it */
288             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
289             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
290             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
291             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
292             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
293             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
294             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
295             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
296             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
297
298             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
299             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
300             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
301             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
302             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
303             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
304             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
305             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
306             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
307
308             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
309             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
310             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
311             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
312             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
313             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
314             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
315             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
316             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
317
318             fjx1             = _mm256_setzero_ps();
319             fjy1             = _mm256_setzero_ps();
320             fjz1             = _mm256_setzero_ps();
321             fjx2             = _mm256_setzero_ps();
322             fjy2             = _mm256_setzero_ps();
323             fjz2             = _mm256_setzero_ps();
324             fjx3             = _mm256_setzero_ps();
325             fjy3             = _mm256_setzero_ps();
326             fjz3             = _mm256_setzero_ps();
327
328             /**************************
329              * CALCULATE INTERACTIONS *
330              **************************/
331
332             if (gmx_mm256_any_lt(rsq11,rcutoff2))
333             {
334
335             r11              = _mm256_mul_ps(rsq11,rinv11);
336
337             /* EWALD ELECTROSTATICS */
338             
339             /* Analytical PME correction */
340             zeta2            = _mm256_mul_ps(beta2,rsq11);
341             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
342             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
343             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
344             felec            = _mm256_mul_ps(qq11,felec);
345             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
346             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
347             velec            = _mm256_sub_ps(rinv11,pmecorrV);
348             velec            = _mm256_mul_ps(qq11,velec);
349             
350             d                = _mm256_sub_ps(r11,rswitch);
351             d                = _mm256_max_ps(d,_mm256_setzero_ps());
352             d2               = _mm256_mul_ps(d,d);
353             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
354
355             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
356
357             /* Evaluate switch function */
358             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
359             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
360             velec            = _mm256_mul_ps(velec,sw);
361             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
362
363             /* Update potential sum for this i atom from the interaction with this j atom. */
364             velec            = _mm256_and_ps(velec,cutoff_mask);
365             velecsum         = _mm256_add_ps(velecsum,velec);
366
367             fscal            = felec;
368
369             fscal            = _mm256_and_ps(fscal,cutoff_mask);
370
371             /* Calculate temporary vectorial force */
372             tx               = _mm256_mul_ps(fscal,dx11);
373             ty               = _mm256_mul_ps(fscal,dy11);
374             tz               = _mm256_mul_ps(fscal,dz11);
375
376             /* Update vectorial force */
377             fix1             = _mm256_add_ps(fix1,tx);
378             fiy1             = _mm256_add_ps(fiy1,ty);
379             fiz1             = _mm256_add_ps(fiz1,tz);
380
381             fjx1             = _mm256_add_ps(fjx1,tx);
382             fjy1             = _mm256_add_ps(fjy1,ty);
383             fjz1             = _mm256_add_ps(fjz1,tz);
384
385             }
386
387             /**************************
388              * CALCULATE INTERACTIONS *
389              **************************/
390
391             if (gmx_mm256_any_lt(rsq12,rcutoff2))
392             {
393
394             r12              = _mm256_mul_ps(rsq12,rinv12);
395
396             /* EWALD ELECTROSTATICS */
397             
398             /* Analytical PME correction */
399             zeta2            = _mm256_mul_ps(beta2,rsq12);
400             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
401             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
402             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
403             felec            = _mm256_mul_ps(qq12,felec);
404             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
405             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
406             velec            = _mm256_sub_ps(rinv12,pmecorrV);
407             velec            = _mm256_mul_ps(qq12,velec);
408             
409             d                = _mm256_sub_ps(r12,rswitch);
410             d                = _mm256_max_ps(d,_mm256_setzero_ps());
411             d2               = _mm256_mul_ps(d,d);
412             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
413
414             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
415
416             /* Evaluate switch function */
417             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
418             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
419             velec            = _mm256_mul_ps(velec,sw);
420             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
421
422             /* Update potential sum for this i atom from the interaction with this j atom. */
423             velec            = _mm256_and_ps(velec,cutoff_mask);
424             velecsum         = _mm256_add_ps(velecsum,velec);
425
426             fscal            = felec;
427
428             fscal            = _mm256_and_ps(fscal,cutoff_mask);
429
430             /* Calculate temporary vectorial force */
431             tx               = _mm256_mul_ps(fscal,dx12);
432             ty               = _mm256_mul_ps(fscal,dy12);
433             tz               = _mm256_mul_ps(fscal,dz12);
434
435             /* Update vectorial force */
436             fix1             = _mm256_add_ps(fix1,tx);
437             fiy1             = _mm256_add_ps(fiy1,ty);
438             fiz1             = _mm256_add_ps(fiz1,tz);
439
440             fjx2             = _mm256_add_ps(fjx2,tx);
441             fjy2             = _mm256_add_ps(fjy2,ty);
442             fjz2             = _mm256_add_ps(fjz2,tz);
443
444             }
445
446             /**************************
447              * CALCULATE INTERACTIONS *
448              **************************/
449
450             if (gmx_mm256_any_lt(rsq13,rcutoff2))
451             {
452
453             r13              = _mm256_mul_ps(rsq13,rinv13);
454
455             /* EWALD ELECTROSTATICS */
456             
457             /* Analytical PME correction */
458             zeta2            = _mm256_mul_ps(beta2,rsq13);
459             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
460             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
461             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
462             felec            = _mm256_mul_ps(qq13,felec);
463             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
464             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
465             velec            = _mm256_sub_ps(rinv13,pmecorrV);
466             velec            = _mm256_mul_ps(qq13,velec);
467             
468             d                = _mm256_sub_ps(r13,rswitch);
469             d                = _mm256_max_ps(d,_mm256_setzero_ps());
470             d2               = _mm256_mul_ps(d,d);
471             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
472
473             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
474
475             /* Evaluate switch function */
476             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
477             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
478             velec            = _mm256_mul_ps(velec,sw);
479             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
480
481             /* Update potential sum for this i atom from the interaction with this j atom. */
482             velec            = _mm256_and_ps(velec,cutoff_mask);
483             velecsum         = _mm256_add_ps(velecsum,velec);
484
485             fscal            = felec;
486
487             fscal            = _mm256_and_ps(fscal,cutoff_mask);
488
489             /* Calculate temporary vectorial force */
490             tx               = _mm256_mul_ps(fscal,dx13);
491             ty               = _mm256_mul_ps(fscal,dy13);
492             tz               = _mm256_mul_ps(fscal,dz13);
493
494             /* Update vectorial force */
495             fix1             = _mm256_add_ps(fix1,tx);
496             fiy1             = _mm256_add_ps(fiy1,ty);
497             fiz1             = _mm256_add_ps(fiz1,tz);
498
499             fjx3             = _mm256_add_ps(fjx3,tx);
500             fjy3             = _mm256_add_ps(fjy3,ty);
501             fjz3             = _mm256_add_ps(fjz3,tz);
502
503             }
504
505             /**************************
506              * CALCULATE INTERACTIONS *
507              **************************/
508
509             if (gmx_mm256_any_lt(rsq21,rcutoff2))
510             {
511
512             r21              = _mm256_mul_ps(rsq21,rinv21);
513
514             /* EWALD ELECTROSTATICS */
515             
516             /* Analytical PME correction */
517             zeta2            = _mm256_mul_ps(beta2,rsq21);
518             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
519             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
520             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
521             felec            = _mm256_mul_ps(qq21,felec);
522             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
523             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
524             velec            = _mm256_sub_ps(rinv21,pmecorrV);
525             velec            = _mm256_mul_ps(qq21,velec);
526             
527             d                = _mm256_sub_ps(r21,rswitch);
528             d                = _mm256_max_ps(d,_mm256_setzero_ps());
529             d2               = _mm256_mul_ps(d,d);
530             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
531
532             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
533
534             /* Evaluate switch function */
535             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
536             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
537             velec            = _mm256_mul_ps(velec,sw);
538             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
539
540             /* Update potential sum for this i atom from the interaction with this j atom. */
541             velec            = _mm256_and_ps(velec,cutoff_mask);
542             velecsum         = _mm256_add_ps(velecsum,velec);
543
544             fscal            = felec;
545
546             fscal            = _mm256_and_ps(fscal,cutoff_mask);
547
548             /* Calculate temporary vectorial force */
549             tx               = _mm256_mul_ps(fscal,dx21);
550             ty               = _mm256_mul_ps(fscal,dy21);
551             tz               = _mm256_mul_ps(fscal,dz21);
552
553             /* Update vectorial force */
554             fix2             = _mm256_add_ps(fix2,tx);
555             fiy2             = _mm256_add_ps(fiy2,ty);
556             fiz2             = _mm256_add_ps(fiz2,tz);
557
558             fjx1             = _mm256_add_ps(fjx1,tx);
559             fjy1             = _mm256_add_ps(fjy1,ty);
560             fjz1             = _mm256_add_ps(fjz1,tz);
561
562             }
563
564             /**************************
565              * CALCULATE INTERACTIONS *
566              **************************/
567
568             if (gmx_mm256_any_lt(rsq22,rcutoff2))
569             {
570
571             r22              = _mm256_mul_ps(rsq22,rinv22);
572
573             /* EWALD ELECTROSTATICS */
574             
575             /* Analytical PME correction */
576             zeta2            = _mm256_mul_ps(beta2,rsq22);
577             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
578             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
579             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
580             felec            = _mm256_mul_ps(qq22,felec);
581             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
582             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
583             velec            = _mm256_sub_ps(rinv22,pmecorrV);
584             velec            = _mm256_mul_ps(qq22,velec);
585             
586             d                = _mm256_sub_ps(r22,rswitch);
587             d                = _mm256_max_ps(d,_mm256_setzero_ps());
588             d2               = _mm256_mul_ps(d,d);
589             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
590
591             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
592
593             /* Evaluate switch function */
594             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
595             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
596             velec            = _mm256_mul_ps(velec,sw);
597             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
598
599             /* Update potential sum for this i atom from the interaction with this j atom. */
600             velec            = _mm256_and_ps(velec,cutoff_mask);
601             velecsum         = _mm256_add_ps(velecsum,velec);
602
603             fscal            = felec;
604
605             fscal            = _mm256_and_ps(fscal,cutoff_mask);
606
607             /* Calculate temporary vectorial force */
608             tx               = _mm256_mul_ps(fscal,dx22);
609             ty               = _mm256_mul_ps(fscal,dy22);
610             tz               = _mm256_mul_ps(fscal,dz22);
611
612             /* Update vectorial force */
613             fix2             = _mm256_add_ps(fix2,tx);
614             fiy2             = _mm256_add_ps(fiy2,ty);
615             fiz2             = _mm256_add_ps(fiz2,tz);
616
617             fjx2             = _mm256_add_ps(fjx2,tx);
618             fjy2             = _mm256_add_ps(fjy2,ty);
619             fjz2             = _mm256_add_ps(fjz2,tz);
620
621             }
622
623             /**************************
624              * CALCULATE INTERACTIONS *
625              **************************/
626
627             if (gmx_mm256_any_lt(rsq23,rcutoff2))
628             {
629
630             r23              = _mm256_mul_ps(rsq23,rinv23);
631
632             /* EWALD ELECTROSTATICS */
633             
634             /* Analytical PME correction */
635             zeta2            = _mm256_mul_ps(beta2,rsq23);
636             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
637             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
638             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
639             felec            = _mm256_mul_ps(qq23,felec);
640             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
641             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
642             velec            = _mm256_sub_ps(rinv23,pmecorrV);
643             velec            = _mm256_mul_ps(qq23,velec);
644             
645             d                = _mm256_sub_ps(r23,rswitch);
646             d                = _mm256_max_ps(d,_mm256_setzero_ps());
647             d2               = _mm256_mul_ps(d,d);
648             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
649
650             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
651
652             /* Evaluate switch function */
653             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
654             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
655             velec            = _mm256_mul_ps(velec,sw);
656             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
657
658             /* Update potential sum for this i atom from the interaction with this j atom. */
659             velec            = _mm256_and_ps(velec,cutoff_mask);
660             velecsum         = _mm256_add_ps(velecsum,velec);
661
662             fscal            = felec;
663
664             fscal            = _mm256_and_ps(fscal,cutoff_mask);
665
666             /* Calculate temporary vectorial force */
667             tx               = _mm256_mul_ps(fscal,dx23);
668             ty               = _mm256_mul_ps(fscal,dy23);
669             tz               = _mm256_mul_ps(fscal,dz23);
670
671             /* Update vectorial force */
672             fix2             = _mm256_add_ps(fix2,tx);
673             fiy2             = _mm256_add_ps(fiy2,ty);
674             fiz2             = _mm256_add_ps(fiz2,tz);
675
676             fjx3             = _mm256_add_ps(fjx3,tx);
677             fjy3             = _mm256_add_ps(fjy3,ty);
678             fjz3             = _mm256_add_ps(fjz3,tz);
679
680             }
681
682             /**************************
683              * CALCULATE INTERACTIONS *
684              **************************/
685
686             if (gmx_mm256_any_lt(rsq31,rcutoff2))
687             {
688
689             r31              = _mm256_mul_ps(rsq31,rinv31);
690
691             /* EWALD ELECTROSTATICS */
692             
693             /* Analytical PME correction */
694             zeta2            = _mm256_mul_ps(beta2,rsq31);
695             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
696             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
697             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
698             felec            = _mm256_mul_ps(qq31,felec);
699             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
700             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
701             velec            = _mm256_sub_ps(rinv31,pmecorrV);
702             velec            = _mm256_mul_ps(qq31,velec);
703             
704             d                = _mm256_sub_ps(r31,rswitch);
705             d                = _mm256_max_ps(d,_mm256_setzero_ps());
706             d2               = _mm256_mul_ps(d,d);
707             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
708
709             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
710
711             /* Evaluate switch function */
712             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
713             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
714             velec            = _mm256_mul_ps(velec,sw);
715             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
716
717             /* Update potential sum for this i atom from the interaction with this j atom. */
718             velec            = _mm256_and_ps(velec,cutoff_mask);
719             velecsum         = _mm256_add_ps(velecsum,velec);
720
721             fscal            = felec;
722
723             fscal            = _mm256_and_ps(fscal,cutoff_mask);
724
725             /* Calculate temporary vectorial force */
726             tx               = _mm256_mul_ps(fscal,dx31);
727             ty               = _mm256_mul_ps(fscal,dy31);
728             tz               = _mm256_mul_ps(fscal,dz31);
729
730             /* Update vectorial force */
731             fix3             = _mm256_add_ps(fix3,tx);
732             fiy3             = _mm256_add_ps(fiy3,ty);
733             fiz3             = _mm256_add_ps(fiz3,tz);
734
735             fjx1             = _mm256_add_ps(fjx1,tx);
736             fjy1             = _mm256_add_ps(fjy1,ty);
737             fjz1             = _mm256_add_ps(fjz1,tz);
738
739             }
740
741             /**************************
742              * CALCULATE INTERACTIONS *
743              **************************/
744
745             if (gmx_mm256_any_lt(rsq32,rcutoff2))
746             {
747
748             r32              = _mm256_mul_ps(rsq32,rinv32);
749
750             /* EWALD ELECTROSTATICS */
751             
752             /* Analytical PME correction */
753             zeta2            = _mm256_mul_ps(beta2,rsq32);
754             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
755             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
756             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
757             felec            = _mm256_mul_ps(qq32,felec);
758             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
759             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
760             velec            = _mm256_sub_ps(rinv32,pmecorrV);
761             velec            = _mm256_mul_ps(qq32,velec);
762             
763             d                = _mm256_sub_ps(r32,rswitch);
764             d                = _mm256_max_ps(d,_mm256_setzero_ps());
765             d2               = _mm256_mul_ps(d,d);
766             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
767
768             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
769
770             /* Evaluate switch function */
771             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
772             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
773             velec            = _mm256_mul_ps(velec,sw);
774             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
775
776             /* Update potential sum for this i atom from the interaction with this j atom. */
777             velec            = _mm256_and_ps(velec,cutoff_mask);
778             velecsum         = _mm256_add_ps(velecsum,velec);
779
780             fscal            = felec;
781
782             fscal            = _mm256_and_ps(fscal,cutoff_mask);
783
784             /* Calculate temporary vectorial force */
785             tx               = _mm256_mul_ps(fscal,dx32);
786             ty               = _mm256_mul_ps(fscal,dy32);
787             tz               = _mm256_mul_ps(fscal,dz32);
788
789             /* Update vectorial force */
790             fix3             = _mm256_add_ps(fix3,tx);
791             fiy3             = _mm256_add_ps(fiy3,ty);
792             fiz3             = _mm256_add_ps(fiz3,tz);
793
794             fjx2             = _mm256_add_ps(fjx2,tx);
795             fjy2             = _mm256_add_ps(fjy2,ty);
796             fjz2             = _mm256_add_ps(fjz2,tz);
797
798             }
799
800             /**************************
801              * CALCULATE INTERACTIONS *
802              **************************/
803
804             if (gmx_mm256_any_lt(rsq33,rcutoff2))
805             {
806
807             r33              = _mm256_mul_ps(rsq33,rinv33);
808
809             /* EWALD ELECTROSTATICS */
810             
811             /* Analytical PME correction */
812             zeta2            = _mm256_mul_ps(beta2,rsq33);
813             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
814             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
815             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
816             felec            = _mm256_mul_ps(qq33,felec);
817             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
818             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
819             velec            = _mm256_sub_ps(rinv33,pmecorrV);
820             velec            = _mm256_mul_ps(qq33,velec);
821             
822             d                = _mm256_sub_ps(r33,rswitch);
823             d                = _mm256_max_ps(d,_mm256_setzero_ps());
824             d2               = _mm256_mul_ps(d,d);
825             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
826
827             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
828
829             /* Evaluate switch function */
830             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
831             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
832             velec            = _mm256_mul_ps(velec,sw);
833             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
834
835             /* Update potential sum for this i atom from the interaction with this j atom. */
836             velec            = _mm256_and_ps(velec,cutoff_mask);
837             velecsum         = _mm256_add_ps(velecsum,velec);
838
839             fscal            = felec;
840
841             fscal            = _mm256_and_ps(fscal,cutoff_mask);
842
843             /* Calculate temporary vectorial force */
844             tx               = _mm256_mul_ps(fscal,dx33);
845             ty               = _mm256_mul_ps(fscal,dy33);
846             tz               = _mm256_mul_ps(fscal,dz33);
847
848             /* Update vectorial force */
849             fix3             = _mm256_add_ps(fix3,tx);
850             fiy3             = _mm256_add_ps(fiy3,ty);
851             fiz3             = _mm256_add_ps(fiz3,tz);
852
853             fjx3             = _mm256_add_ps(fjx3,tx);
854             fjy3             = _mm256_add_ps(fjy3,ty);
855             fjz3             = _mm256_add_ps(fjz3,tz);
856
857             }
858
859             fjptrA             = f+j_coord_offsetA;
860             fjptrB             = f+j_coord_offsetB;
861             fjptrC             = f+j_coord_offsetC;
862             fjptrD             = f+j_coord_offsetD;
863             fjptrE             = f+j_coord_offsetE;
864             fjptrF             = f+j_coord_offsetF;
865             fjptrG             = f+j_coord_offsetG;
866             fjptrH             = f+j_coord_offsetH;
867
868             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
869                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
870                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
871
872             /* Inner loop uses 972 flops */
873         }
874
875         if(jidx<j_index_end)
876         {
877
878             /* Get j neighbor index, and coordinate index */
879             jnrlistA         = jjnr[jidx];
880             jnrlistB         = jjnr[jidx+1];
881             jnrlistC         = jjnr[jidx+2];
882             jnrlistD         = jjnr[jidx+3];
883             jnrlistE         = jjnr[jidx+4];
884             jnrlistF         = jjnr[jidx+5];
885             jnrlistG         = jjnr[jidx+6];
886             jnrlistH         = jjnr[jidx+7];
887             /* Sign of each element will be negative for non-real atoms.
888              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
889              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
890              */
891             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
892                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
893                                             
894             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
895             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
896             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
897             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
898             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
899             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
900             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
901             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
902             j_coord_offsetA  = DIM*jnrA;
903             j_coord_offsetB  = DIM*jnrB;
904             j_coord_offsetC  = DIM*jnrC;
905             j_coord_offsetD  = DIM*jnrD;
906             j_coord_offsetE  = DIM*jnrE;
907             j_coord_offsetF  = DIM*jnrF;
908             j_coord_offsetG  = DIM*jnrG;
909             j_coord_offsetH  = DIM*jnrH;
910
911             /* load j atom coordinates */
912             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
913                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
914                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
915                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
916                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
917
918             /* Calculate displacement vector */
919             dx11             = _mm256_sub_ps(ix1,jx1);
920             dy11             = _mm256_sub_ps(iy1,jy1);
921             dz11             = _mm256_sub_ps(iz1,jz1);
922             dx12             = _mm256_sub_ps(ix1,jx2);
923             dy12             = _mm256_sub_ps(iy1,jy2);
924             dz12             = _mm256_sub_ps(iz1,jz2);
925             dx13             = _mm256_sub_ps(ix1,jx3);
926             dy13             = _mm256_sub_ps(iy1,jy3);
927             dz13             = _mm256_sub_ps(iz1,jz3);
928             dx21             = _mm256_sub_ps(ix2,jx1);
929             dy21             = _mm256_sub_ps(iy2,jy1);
930             dz21             = _mm256_sub_ps(iz2,jz1);
931             dx22             = _mm256_sub_ps(ix2,jx2);
932             dy22             = _mm256_sub_ps(iy2,jy2);
933             dz22             = _mm256_sub_ps(iz2,jz2);
934             dx23             = _mm256_sub_ps(ix2,jx3);
935             dy23             = _mm256_sub_ps(iy2,jy3);
936             dz23             = _mm256_sub_ps(iz2,jz3);
937             dx31             = _mm256_sub_ps(ix3,jx1);
938             dy31             = _mm256_sub_ps(iy3,jy1);
939             dz31             = _mm256_sub_ps(iz3,jz1);
940             dx32             = _mm256_sub_ps(ix3,jx2);
941             dy32             = _mm256_sub_ps(iy3,jy2);
942             dz32             = _mm256_sub_ps(iz3,jz2);
943             dx33             = _mm256_sub_ps(ix3,jx3);
944             dy33             = _mm256_sub_ps(iy3,jy3);
945             dz33             = _mm256_sub_ps(iz3,jz3);
946
947             /* Calculate squared distance and things based on it */
948             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
949             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
950             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
951             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
952             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
953             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
954             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
955             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
956             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
957
958             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
959             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
960             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
961             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
962             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
963             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
964             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
965             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
966             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
967
968             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
969             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
970             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
971             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
972             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
973             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
974             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
975             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
976             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
977
978             fjx1             = _mm256_setzero_ps();
979             fjy1             = _mm256_setzero_ps();
980             fjz1             = _mm256_setzero_ps();
981             fjx2             = _mm256_setzero_ps();
982             fjy2             = _mm256_setzero_ps();
983             fjz2             = _mm256_setzero_ps();
984             fjx3             = _mm256_setzero_ps();
985             fjy3             = _mm256_setzero_ps();
986             fjz3             = _mm256_setzero_ps();
987
988             /**************************
989              * CALCULATE INTERACTIONS *
990              **************************/
991
992             if (gmx_mm256_any_lt(rsq11,rcutoff2))
993             {
994
995             r11              = _mm256_mul_ps(rsq11,rinv11);
996             r11              = _mm256_andnot_ps(dummy_mask,r11);
997
998             /* EWALD ELECTROSTATICS */
999             
1000             /* Analytical PME correction */
1001             zeta2            = _mm256_mul_ps(beta2,rsq11);
1002             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1003             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1004             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1005             felec            = _mm256_mul_ps(qq11,felec);
1006             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1007             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1008             velec            = _mm256_sub_ps(rinv11,pmecorrV);
1009             velec            = _mm256_mul_ps(qq11,velec);
1010             
1011             d                = _mm256_sub_ps(r11,rswitch);
1012             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1013             d2               = _mm256_mul_ps(d,d);
1014             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1015
1016             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1017
1018             /* Evaluate switch function */
1019             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1020             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1021             velec            = _mm256_mul_ps(velec,sw);
1022             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1023
1024             /* Update potential sum for this i atom from the interaction with this j atom. */
1025             velec            = _mm256_and_ps(velec,cutoff_mask);
1026             velec            = _mm256_andnot_ps(dummy_mask,velec);
1027             velecsum         = _mm256_add_ps(velecsum,velec);
1028
1029             fscal            = felec;
1030
1031             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1032
1033             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1034
1035             /* Calculate temporary vectorial force */
1036             tx               = _mm256_mul_ps(fscal,dx11);
1037             ty               = _mm256_mul_ps(fscal,dy11);
1038             tz               = _mm256_mul_ps(fscal,dz11);
1039
1040             /* Update vectorial force */
1041             fix1             = _mm256_add_ps(fix1,tx);
1042             fiy1             = _mm256_add_ps(fiy1,ty);
1043             fiz1             = _mm256_add_ps(fiz1,tz);
1044
1045             fjx1             = _mm256_add_ps(fjx1,tx);
1046             fjy1             = _mm256_add_ps(fjy1,ty);
1047             fjz1             = _mm256_add_ps(fjz1,tz);
1048
1049             }
1050
1051             /**************************
1052              * CALCULATE INTERACTIONS *
1053              **************************/
1054
1055             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1056             {
1057
1058             r12              = _mm256_mul_ps(rsq12,rinv12);
1059             r12              = _mm256_andnot_ps(dummy_mask,r12);
1060
1061             /* EWALD ELECTROSTATICS */
1062             
1063             /* Analytical PME correction */
1064             zeta2            = _mm256_mul_ps(beta2,rsq12);
1065             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1066             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1067             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1068             felec            = _mm256_mul_ps(qq12,felec);
1069             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1070             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1071             velec            = _mm256_sub_ps(rinv12,pmecorrV);
1072             velec            = _mm256_mul_ps(qq12,velec);
1073             
1074             d                = _mm256_sub_ps(r12,rswitch);
1075             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1076             d2               = _mm256_mul_ps(d,d);
1077             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1078
1079             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1080
1081             /* Evaluate switch function */
1082             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1083             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1084             velec            = _mm256_mul_ps(velec,sw);
1085             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1086
1087             /* Update potential sum for this i atom from the interaction with this j atom. */
1088             velec            = _mm256_and_ps(velec,cutoff_mask);
1089             velec            = _mm256_andnot_ps(dummy_mask,velec);
1090             velecsum         = _mm256_add_ps(velecsum,velec);
1091
1092             fscal            = felec;
1093
1094             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1095
1096             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1097
1098             /* Calculate temporary vectorial force */
1099             tx               = _mm256_mul_ps(fscal,dx12);
1100             ty               = _mm256_mul_ps(fscal,dy12);
1101             tz               = _mm256_mul_ps(fscal,dz12);
1102
1103             /* Update vectorial force */
1104             fix1             = _mm256_add_ps(fix1,tx);
1105             fiy1             = _mm256_add_ps(fiy1,ty);
1106             fiz1             = _mm256_add_ps(fiz1,tz);
1107
1108             fjx2             = _mm256_add_ps(fjx2,tx);
1109             fjy2             = _mm256_add_ps(fjy2,ty);
1110             fjz2             = _mm256_add_ps(fjz2,tz);
1111
1112             }
1113
1114             /**************************
1115              * CALCULATE INTERACTIONS *
1116              **************************/
1117
1118             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1119             {
1120
1121             r13              = _mm256_mul_ps(rsq13,rinv13);
1122             r13              = _mm256_andnot_ps(dummy_mask,r13);
1123
1124             /* EWALD ELECTROSTATICS */
1125             
1126             /* Analytical PME correction */
1127             zeta2            = _mm256_mul_ps(beta2,rsq13);
1128             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
1129             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1130             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1131             felec            = _mm256_mul_ps(qq13,felec);
1132             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1133             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1134             velec            = _mm256_sub_ps(rinv13,pmecorrV);
1135             velec            = _mm256_mul_ps(qq13,velec);
1136             
1137             d                = _mm256_sub_ps(r13,rswitch);
1138             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1139             d2               = _mm256_mul_ps(d,d);
1140             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1141
1142             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1143
1144             /* Evaluate switch function */
1145             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1146             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
1147             velec            = _mm256_mul_ps(velec,sw);
1148             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1149
1150             /* Update potential sum for this i atom from the interaction with this j atom. */
1151             velec            = _mm256_and_ps(velec,cutoff_mask);
1152             velec            = _mm256_andnot_ps(dummy_mask,velec);
1153             velecsum         = _mm256_add_ps(velecsum,velec);
1154
1155             fscal            = felec;
1156
1157             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1158
1159             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1160
1161             /* Calculate temporary vectorial force */
1162             tx               = _mm256_mul_ps(fscal,dx13);
1163             ty               = _mm256_mul_ps(fscal,dy13);
1164             tz               = _mm256_mul_ps(fscal,dz13);
1165
1166             /* Update vectorial force */
1167             fix1             = _mm256_add_ps(fix1,tx);
1168             fiy1             = _mm256_add_ps(fiy1,ty);
1169             fiz1             = _mm256_add_ps(fiz1,tz);
1170
1171             fjx3             = _mm256_add_ps(fjx3,tx);
1172             fjy3             = _mm256_add_ps(fjy3,ty);
1173             fjz3             = _mm256_add_ps(fjz3,tz);
1174
1175             }
1176
1177             /**************************
1178              * CALCULATE INTERACTIONS *
1179              **************************/
1180
1181             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1182             {
1183
1184             r21              = _mm256_mul_ps(rsq21,rinv21);
1185             r21              = _mm256_andnot_ps(dummy_mask,r21);
1186
1187             /* EWALD ELECTROSTATICS */
1188             
1189             /* Analytical PME correction */
1190             zeta2            = _mm256_mul_ps(beta2,rsq21);
1191             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1192             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1193             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1194             felec            = _mm256_mul_ps(qq21,felec);
1195             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1196             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1197             velec            = _mm256_sub_ps(rinv21,pmecorrV);
1198             velec            = _mm256_mul_ps(qq21,velec);
1199             
1200             d                = _mm256_sub_ps(r21,rswitch);
1201             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1202             d2               = _mm256_mul_ps(d,d);
1203             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1204
1205             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1206
1207             /* Evaluate switch function */
1208             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1209             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
1210             velec            = _mm256_mul_ps(velec,sw);
1211             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1212
1213             /* Update potential sum for this i atom from the interaction with this j atom. */
1214             velec            = _mm256_and_ps(velec,cutoff_mask);
1215             velec            = _mm256_andnot_ps(dummy_mask,velec);
1216             velecsum         = _mm256_add_ps(velecsum,velec);
1217
1218             fscal            = felec;
1219
1220             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1221
1222             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1223
1224             /* Calculate temporary vectorial force */
1225             tx               = _mm256_mul_ps(fscal,dx21);
1226             ty               = _mm256_mul_ps(fscal,dy21);
1227             tz               = _mm256_mul_ps(fscal,dz21);
1228
1229             /* Update vectorial force */
1230             fix2             = _mm256_add_ps(fix2,tx);
1231             fiy2             = _mm256_add_ps(fiy2,ty);
1232             fiz2             = _mm256_add_ps(fiz2,tz);
1233
1234             fjx1             = _mm256_add_ps(fjx1,tx);
1235             fjy1             = _mm256_add_ps(fjy1,ty);
1236             fjz1             = _mm256_add_ps(fjz1,tz);
1237
1238             }
1239
1240             /**************************
1241              * CALCULATE INTERACTIONS *
1242              **************************/
1243
1244             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1245             {
1246
1247             r22              = _mm256_mul_ps(rsq22,rinv22);
1248             r22              = _mm256_andnot_ps(dummy_mask,r22);
1249
1250             /* EWALD ELECTROSTATICS */
1251             
1252             /* Analytical PME correction */
1253             zeta2            = _mm256_mul_ps(beta2,rsq22);
1254             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1255             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1256             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1257             felec            = _mm256_mul_ps(qq22,felec);
1258             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1259             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1260             velec            = _mm256_sub_ps(rinv22,pmecorrV);
1261             velec            = _mm256_mul_ps(qq22,velec);
1262             
1263             d                = _mm256_sub_ps(r22,rswitch);
1264             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1265             d2               = _mm256_mul_ps(d,d);
1266             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1267
1268             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1269
1270             /* Evaluate switch function */
1271             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1272             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
1273             velec            = _mm256_mul_ps(velec,sw);
1274             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1275
1276             /* Update potential sum for this i atom from the interaction with this j atom. */
1277             velec            = _mm256_and_ps(velec,cutoff_mask);
1278             velec            = _mm256_andnot_ps(dummy_mask,velec);
1279             velecsum         = _mm256_add_ps(velecsum,velec);
1280
1281             fscal            = felec;
1282
1283             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1284
1285             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1286
1287             /* Calculate temporary vectorial force */
1288             tx               = _mm256_mul_ps(fscal,dx22);
1289             ty               = _mm256_mul_ps(fscal,dy22);
1290             tz               = _mm256_mul_ps(fscal,dz22);
1291
1292             /* Update vectorial force */
1293             fix2             = _mm256_add_ps(fix2,tx);
1294             fiy2             = _mm256_add_ps(fiy2,ty);
1295             fiz2             = _mm256_add_ps(fiz2,tz);
1296
1297             fjx2             = _mm256_add_ps(fjx2,tx);
1298             fjy2             = _mm256_add_ps(fjy2,ty);
1299             fjz2             = _mm256_add_ps(fjz2,tz);
1300
1301             }
1302
1303             /**************************
1304              * CALCULATE INTERACTIONS *
1305              **************************/
1306
1307             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1308             {
1309
1310             r23              = _mm256_mul_ps(rsq23,rinv23);
1311             r23              = _mm256_andnot_ps(dummy_mask,r23);
1312
1313             /* EWALD ELECTROSTATICS */
1314             
1315             /* Analytical PME correction */
1316             zeta2            = _mm256_mul_ps(beta2,rsq23);
1317             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
1318             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1319             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1320             felec            = _mm256_mul_ps(qq23,felec);
1321             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1322             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1323             velec            = _mm256_sub_ps(rinv23,pmecorrV);
1324             velec            = _mm256_mul_ps(qq23,velec);
1325             
1326             d                = _mm256_sub_ps(r23,rswitch);
1327             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1328             d2               = _mm256_mul_ps(d,d);
1329             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1330
1331             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1332
1333             /* Evaluate switch function */
1334             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1335             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
1336             velec            = _mm256_mul_ps(velec,sw);
1337             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1338
1339             /* Update potential sum for this i atom from the interaction with this j atom. */
1340             velec            = _mm256_and_ps(velec,cutoff_mask);
1341             velec            = _mm256_andnot_ps(dummy_mask,velec);
1342             velecsum         = _mm256_add_ps(velecsum,velec);
1343
1344             fscal            = felec;
1345
1346             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1347
1348             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1349
1350             /* Calculate temporary vectorial force */
1351             tx               = _mm256_mul_ps(fscal,dx23);
1352             ty               = _mm256_mul_ps(fscal,dy23);
1353             tz               = _mm256_mul_ps(fscal,dz23);
1354
1355             /* Update vectorial force */
1356             fix2             = _mm256_add_ps(fix2,tx);
1357             fiy2             = _mm256_add_ps(fiy2,ty);
1358             fiz2             = _mm256_add_ps(fiz2,tz);
1359
1360             fjx3             = _mm256_add_ps(fjx3,tx);
1361             fjy3             = _mm256_add_ps(fjy3,ty);
1362             fjz3             = _mm256_add_ps(fjz3,tz);
1363
1364             }
1365
1366             /**************************
1367              * CALCULATE INTERACTIONS *
1368              **************************/
1369
1370             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1371             {
1372
1373             r31              = _mm256_mul_ps(rsq31,rinv31);
1374             r31              = _mm256_andnot_ps(dummy_mask,r31);
1375
1376             /* EWALD ELECTROSTATICS */
1377             
1378             /* Analytical PME correction */
1379             zeta2            = _mm256_mul_ps(beta2,rsq31);
1380             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
1381             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1382             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1383             felec            = _mm256_mul_ps(qq31,felec);
1384             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1385             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1386             velec            = _mm256_sub_ps(rinv31,pmecorrV);
1387             velec            = _mm256_mul_ps(qq31,velec);
1388             
1389             d                = _mm256_sub_ps(r31,rswitch);
1390             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1391             d2               = _mm256_mul_ps(d,d);
1392             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1393
1394             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1395
1396             /* Evaluate switch function */
1397             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1398             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
1399             velec            = _mm256_mul_ps(velec,sw);
1400             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1401
1402             /* Update potential sum for this i atom from the interaction with this j atom. */
1403             velec            = _mm256_and_ps(velec,cutoff_mask);
1404             velec            = _mm256_andnot_ps(dummy_mask,velec);
1405             velecsum         = _mm256_add_ps(velecsum,velec);
1406
1407             fscal            = felec;
1408
1409             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1410
1411             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1412
1413             /* Calculate temporary vectorial force */
1414             tx               = _mm256_mul_ps(fscal,dx31);
1415             ty               = _mm256_mul_ps(fscal,dy31);
1416             tz               = _mm256_mul_ps(fscal,dz31);
1417
1418             /* Update vectorial force */
1419             fix3             = _mm256_add_ps(fix3,tx);
1420             fiy3             = _mm256_add_ps(fiy3,ty);
1421             fiz3             = _mm256_add_ps(fiz3,tz);
1422
1423             fjx1             = _mm256_add_ps(fjx1,tx);
1424             fjy1             = _mm256_add_ps(fjy1,ty);
1425             fjz1             = _mm256_add_ps(fjz1,tz);
1426
1427             }
1428
1429             /**************************
1430              * CALCULATE INTERACTIONS *
1431              **************************/
1432
1433             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1434             {
1435
1436             r32              = _mm256_mul_ps(rsq32,rinv32);
1437             r32              = _mm256_andnot_ps(dummy_mask,r32);
1438
1439             /* EWALD ELECTROSTATICS */
1440             
1441             /* Analytical PME correction */
1442             zeta2            = _mm256_mul_ps(beta2,rsq32);
1443             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
1444             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1445             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1446             felec            = _mm256_mul_ps(qq32,felec);
1447             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1448             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1449             velec            = _mm256_sub_ps(rinv32,pmecorrV);
1450             velec            = _mm256_mul_ps(qq32,velec);
1451             
1452             d                = _mm256_sub_ps(r32,rswitch);
1453             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1454             d2               = _mm256_mul_ps(d,d);
1455             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1456
1457             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1458
1459             /* Evaluate switch function */
1460             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1461             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
1462             velec            = _mm256_mul_ps(velec,sw);
1463             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1464
1465             /* Update potential sum for this i atom from the interaction with this j atom. */
1466             velec            = _mm256_and_ps(velec,cutoff_mask);
1467             velec            = _mm256_andnot_ps(dummy_mask,velec);
1468             velecsum         = _mm256_add_ps(velecsum,velec);
1469
1470             fscal            = felec;
1471
1472             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1473
1474             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1475
1476             /* Calculate temporary vectorial force */
1477             tx               = _mm256_mul_ps(fscal,dx32);
1478             ty               = _mm256_mul_ps(fscal,dy32);
1479             tz               = _mm256_mul_ps(fscal,dz32);
1480
1481             /* Update vectorial force */
1482             fix3             = _mm256_add_ps(fix3,tx);
1483             fiy3             = _mm256_add_ps(fiy3,ty);
1484             fiz3             = _mm256_add_ps(fiz3,tz);
1485
1486             fjx2             = _mm256_add_ps(fjx2,tx);
1487             fjy2             = _mm256_add_ps(fjy2,ty);
1488             fjz2             = _mm256_add_ps(fjz2,tz);
1489
1490             }
1491
1492             /**************************
1493              * CALCULATE INTERACTIONS *
1494              **************************/
1495
1496             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1497             {
1498
1499             r33              = _mm256_mul_ps(rsq33,rinv33);
1500             r33              = _mm256_andnot_ps(dummy_mask,r33);
1501
1502             /* EWALD ELECTROSTATICS */
1503             
1504             /* Analytical PME correction */
1505             zeta2            = _mm256_mul_ps(beta2,rsq33);
1506             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
1507             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1508             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1509             felec            = _mm256_mul_ps(qq33,felec);
1510             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1511             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1512             velec            = _mm256_sub_ps(rinv33,pmecorrV);
1513             velec            = _mm256_mul_ps(qq33,velec);
1514             
1515             d                = _mm256_sub_ps(r33,rswitch);
1516             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1517             d2               = _mm256_mul_ps(d,d);
1518             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1519
1520             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1521
1522             /* Evaluate switch function */
1523             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1524             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
1525             velec            = _mm256_mul_ps(velec,sw);
1526             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1527
1528             /* Update potential sum for this i atom from the interaction with this j atom. */
1529             velec            = _mm256_and_ps(velec,cutoff_mask);
1530             velec            = _mm256_andnot_ps(dummy_mask,velec);
1531             velecsum         = _mm256_add_ps(velecsum,velec);
1532
1533             fscal            = felec;
1534
1535             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1536
1537             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1538
1539             /* Calculate temporary vectorial force */
1540             tx               = _mm256_mul_ps(fscal,dx33);
1541             ty               = _mm256_mul_ps(fscal,dy33);
1542             tz               = _mm256_mul_ps(fscal,dz33);
1543
1544             /* Update vectorial force */
1545             fix3             = _mm256_add_ps(fix3,tx);
1546             fiy3             = _mm256_add_ps(fiy3,ty);
1547             fiz3             = _mm256_add_ps(fiz3,tz);
1548
1549             fjx3             = _mm256_add_ps(fjx3,tx);
1550             fjy3             = _mm256_add_ps(fjy3,ty);
1551             fjz3             = _mm256_add_ps(fjz3,tz);
1552
1553             }
1554
1555             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1556             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1557             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1558             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1559             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1560             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1561             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1562             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1563
1564             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1565                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1566                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1567
1568             /* Inner loop uses 981 flops */
1569         }
1570
1571         /* End of innermost loop */
1572
1573         gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1574                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
1575
1576         ggid                        = gid[iidx];
1577         /* Update potential energies */
1578         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1579
1580         /* Increment number of inner iterations */
1581         inneriter                  += j_index_end - j_index_start;
1582
1583         /* Outer loop uses 19 flops */
1584     }
1585
1586     /* Increment number of outer iterations */
1587     outeriter        += nri;
1588
1589     /* Update outer/inner flops */
1590
1591     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*981);
1592 }
1593 /*
1594  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single
1595  * Electrostatics interaction: Ewald
1596  * VdW interaction:            None
1597  * Geometry:                   Water4-Water4
1598  * Calculate force/pot:        Force
1599  */
1600 void
1601 nb_kernel_ElecEwSw_VdwNone_GeomW4W4_F_avx_256_single
1602                     (t_nblist                    * gmx_restrict       nlist,
1603                      rvec                        * gmx_restrict          xx,
1604                      rvec                        * gmx_restrict          ff,
1605                      t_forcerec                  * gmx_restrict          fr,
1606                      t_mdatoms                   * gmx_restrict     mdatoms,
1607                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1608                      t_nrnb                      * gmx_restrict        nrnb)
1609 {
1610     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1611      * just 0 for non-waters.
1612      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1613      * jnr indices corresponding to data put in the four positions in the SIMD register.
1614      */
1615     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1616     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1617     int              jnrA,jnrB,jnrC,jnrD;
1618     int              jnrE,jnrF,jnrG,jnrH;
1619     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1620     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1621     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1622     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1623     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1624     real             rcutoff_scalar;
1625     real             *shiftvec,*fshift,*x,*f;
1626     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1627     real             scratch[4*DIM];
1628     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1629     real *           vdwioffsetptr1;
1630     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1631     real *           vdwioffsetptr2;
1632     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1633     real *           vdwioffsetptr3;
1634     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1635     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1636     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1637     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1638     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1639     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1640     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1641     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1642     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1643     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1644     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1645     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1646     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1647     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1648     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1649     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1650     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1651     real             *charge;
1652     __m256i          ewitab;
1653     __m128i          ewitab_lo,ewitab_hi;
1654     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1655     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1656     real             *ewtab;
1657     __m256           rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
1658     real             rswitch_scalar,d_scalar;
1659     __m256           dummy_mask,cutoff_mask;
1660     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1661     __m256           one     = _mm256_set1_ps(1.0);
1662     __m256           two     = _mm256_set1_ps(2.0);
1663     x                = xx[0];
1664     f                = ff[0];
1665
1666     nri              = nlist->nri;
1667     iinr             = nlist->iinr;
1668     jindex           = nlist->jindex;
1669     jjnr             = nlist->jjnr;
1670     shiftidx         = nlist->shift;
1671     gid              = nlist->gid;
1672     shiftvec         = fr->shift_vec[0];
1673     fshift           = fr->fshift[0];
1674     facel            = _mm256_set1_ps(fr->epsfac);
1675     charge           = mdatoms->chargeA;
1676
1677     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1678     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1679     beta2            = _mm256_mul_ps(beta,beta);
1680     beta3            = _mm256_mul_ps(beta,beta2);
1681
1682     ewtab            = fr->ic->tabq_coul_FDV0;
1683     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1684     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1685
1686     /* Setup water-specific parameters */
1687     inr              = nlist->iinr[0];
1688     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1689     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1690     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1691
1692     jq1              = _mm256_set1_ps(charge[inr+1]);
1693     jq2              = _mm256_set1_ps(charge[inr+2]);
1694     jq3              = _mm256_set1_ps(charge[inr+3]);
1695     qq11             = _mm256_mul_ps(iq1,jq1);
1696     qq12             = _mm256_mul_ps(iq1,jq2);
1697     qq13             = _mm256_mul_ps(iq1,jq3);
1698     qq21             = _mm256_mul_ps(iq2,jq1);
1699     qq22             = _mm256_mul_ps(iq2,jq2);
1700     qq23             = _mm256_mul_ps(iq2,jq3);
1701     qq31             = _mm256_mul_ps(iq3,jq1);
1702     qq32             = _mm256_mul_ps(iq3,jq2);
1703     qq33             = _mm256_mul_ps(iq3,jq3);
1704
1705     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1706     rcutoff_scalar   = fr->rcoulomb;
1707     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1708     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1709
1710     rswitch_scalar   = fr->rcoulomb_switch;
1711     rswitch          = _mm256_set1_ps(rswitch_scalar);
1712     /* Setup switch parameters */
1713     d_scalar         = rcutoff_scalar-rswitch_scalar;
1714     d                = _mm256_set1_ps(d_scalar);
1715     swV3             = _mm256_set1_ps(-10.0/(d_scalar*d_scalar*d_scalar));
1716     swV4             = _mm256_set1_ps( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1717     swV5             = _mm256_set1_ps( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1718     swF2             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar));
1719     swF3             = _mm256_set1_ps( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
1720     swF4             = _mm256_set1_ps(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
1721
1722     /* Avoid stupid compiler warnings */
1723     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1724     j_coord_offsetA = 0;
1725     j_coord_offsetB = 0;
1726     j_coord_offsetC = 0;
1727     j_coord_offsetD = 0;
1728     j_coord_offsetE = 0;
1729     j_coord_offsetF = 0;
1730     j_coord_offsetG = 0;
1731     j_coord_offsetH = 0;
1732
1733     outeriter        = 0;
1734     inneriter        = 0;
1735
1736     for(iidx=0;iidx<4*DIM;iidx++)
1737     {
1738         scratch[iidx] = 0.0;
1739     }
1740
1741     /* Start outer loop over neighborlists */
1742     for(iidx=0; iidx<nri; iidx++)
1743     {
1744         /* Load shift vector for this list */
1745         i_shift_offset   = DIM*shiftidx[iidx];
1746
1747         /* Load limits for loop over neighbors */
1748         j_index_start    = jindex[iidx];
1749         j_index_end      = jindex[iidx+1];
1750
1751         /* Get outer coordinate index */
1752         inr              = iinr[iidx];
1753         i_coord_offset   = DIM*inr;
1754
1755         /* Load i particle coords and add shift vector */
1756         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1757                                                     &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1758
1759         fix1             = _mm256_setzero_ps();
1760         fiy1             = _mm256_setzero_ps();
1761         fiz1             = _mm256_setzero_ps();
1762         fix2             = _mm256_setzero_ps();
1763         fiy2             = _mm256_setzero_ps();
1764         fiz2             = _mm256_setzero_ps();
1765         fix3             = _mm256_setzero_ps();
1766         fiy3             = _mm256_setzero_ps();
1767         fiz3             = _mm256_setzero_ps();
1768
1769         /* Start inner kernel loop */
1770         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1771         {
1772
1773             /* Get j neighbor index, and coordinate index */
1774             jnrA             = jjnr[jidx];
1775             jnrB             = jjnr[jidx+1];
1776             jnrC             = jjnr[jidx+2];
1777             jnrD             = jjnr[jidx+3];
1778             jnrE             = jjnr[jidx+4];
1779             jnrF             = jjnr[jidx+5];
1780             jnrG             = jjnr[jidx+6];
1781             jnrH             = jjnr[jidx+7];
1782             j_coord_offsetA  = DIM*jnrA;
1783             j_coord_offsetB  = DIM*jnrB;
1784             j_coord_offsetC  = DIM*jnrC;
1785             j_coord_offsetD  = DIM*jnrD;
1786             j_coord_offsetE  = DIM*jnrE;
1787             j_coord_offsetF  = DIM*jnrF;
1788             j_coord_offsetG  = DIM*jnrG;
1789             j_coord_offsetH  = DIM*jnrH;
1790
1791             /* load j atom coordinates */
1792             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1793                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1794                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1795                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1796                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1797
1798             /* Calculate displacement vector */
1799             dx11             = _mm256_sub_ps(ix1,jx1);
1800             dy11             = _mm256_sub_ps(iy1,jy1);
1801             dz11             = _mm256_sub_ps(iz1,jz1);
1802             dx12             = _mm256_sub_ps(ix1,jx2);
1803             dy12             = _mm256_sub_ps(iy1,jy2);
1804             dz12             = _mm256_sub_ps(iz1,jz2);
1805             dx13             = _mm256_sub_ps(ix1,jx3);
1806             dy13             = _mm256_sub_ps(iy1,jy3);
1807             dz13             = _mm256_sub_ps(iz1,jz3);
1808             dx21             = _mm256_sub_ps(ix2,jx1);
1809             dy21             = _mm256_sub_ps(iy2,jy1);
1810             dz21             = _mm256_sub_ps(iz2,jz1);
1811             dx22             = _mm256_sub_ps(ix2,jx2);
1812             dy22             = _mm256_sub_ps(iy2,jy2);
1813             dz22             = _mm256_sub_ps(iz2,jz2);
1814             dx23             = _mm256_sub_ps(ix2,jx3);
1815             dy23             = _mm256_sub_ps(iy2,jy3);
1816             dz23             = _mm256_sub_ps(iz2,jz3);
1817             dx31             = _mm256_sub_ps(ix3,jx1);
1818             dy31             = _mm256_sub_ps(iy3,jy1);
1819             dz31             = _mm256_sub_ps(iz3,jz1);
1820             dx32             = _mm256_sub_ps(ix3,jx2);
1821             dy32             = _mm256_sub_ps(iy3,jy2);
1822             dz32             = _mm256_sub_ps(iz3,jz2);
1823             dx33             = _mm256_sub_ps(ix3,jx3);
1824             dy33             = _mm256_sub_ps(iy3,jy3);
1825             dz33             = _mm256_sub_ps(iz3,jz3);
1826
1827             /* Calculate squared distance and things based on it */
1828             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1829             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1830             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1831             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1832             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1833             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1834             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1835             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1836             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1837
1838             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1839             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1840             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1841             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1842             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1843             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1844             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1845             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1846             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1847
1848             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1849             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1850             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1851             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1852             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1853             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1854             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1855             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1856             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1857
1858             fjx1             = _mm256_setzero_ps();
1859             fjy1             = _mm256_setzero_ps();
1860             fjz1             = _mm256_setzero_ps();
1861             fjx2             = _mm256_setzero_ps();
1862             fjy2             = _mm256_setzero_ps();
1863             fjz2             = _mm256_setzero_ps();
1864             fjx3             = _mm256_setzero_ps();
1865             fjy3             = _mm256_setzero_ps();
1866             fjz3             = _mm256_setzero_ps();
1867
1868             /**************************
1869              * CALCULATE INTERACTIONS *
1870              **************************/
1871
1872             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1873             {
1874
1875             r11              = _mm256_mul_ps(rsq11,rinv11);
1876
1877             /* EWALD ELECTROSTATICS */
1878             
1879             /* Analytical PME correction */
1880             zeta2            = _mm256_mul_ps(beta2,rsq11);
1881             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1882             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1883             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1884             felec            = _mm256_mul_ps(qq11,felec);
1885             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1886             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1887             velec            = _mm256_sub_ps(rinv11,pmecorrV);
1888             velec            = _mm256_mul_ps(qq11,velec);
1889             
1890             d                = _mm256_sub_ps(r11,rswitch);
1891             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1892             d2               = _mm256_mul_ps(d,d);
1893             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1894
1895             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1896
1897             /* Evaluate switch function */
1898             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1899             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
1900             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1901
1902             fscal            = felec;
1903
1904             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1905
1906             /* Calculate temporary vectorial force */
1907             tx               = _mm256_mul_ps(fscal,dx11);
1908             ty               = _mm256_mul_ps(fscal,dy11);
1909             tz               = _mm256_mul_ps(fscal,dz11);
1910
1911             /* Update vectorial force */
1912             fix1             = _mm256_add_ps(fix1,tx);
1913             fiy1             = _mm256_add_ps(fiy1,ty);
1914             fiz1             = _mm256_add_ps(fiz1,tz);
1915
1916             fjx1             = _mm256_add_ps(fjx1,tx);
1917             fjy1             = _mm256_add_ps(fjy1,ty);
1918             fjz1             = _mm256_add_ps(fjz1,tz);
1919
1920             }
1921
1922             /**************************
1923              * CALCULATE INTERACTIONS *
1924              **************************/
1925
1926             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1927             {
1928
1929             r12              = _mm256_mul_ps(rsq12,rinv12);
1930
1931             /* EWALD ELECTROSTATICS */
1932             
1933             /* Analytical PME correction */
1934             zeta2            = _mm256_mul_ps(beta2,rsq12);
1935             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1936             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1937             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1938             felec            = _mm256_mul_ps(qq12,felec);
1939             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1940             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1941             velec            = _mm256_sub_ps(rinv12,pmecorrV);
1942             velec            = _mm256_mul_ps(qq12,velec);
1943             
1944             d                = _mm256_sub_ps(r12,rswitch);
1945             d                = _mm256_max_ps(d,_mm256_setzero_ps());
1946             d2               = _mm256_mul_ps(d,d);
1947             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
1948
1949             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
1950
1951             /* Evaluate switch function */
1952             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1953             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
1954             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1955
1956             fscal            = felec;
1957
1958             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1959
1960             /* Calculate temporary vectorial force */
1961             tx               = _mm256_mul_ps(fscal,dx12);
1962             ty               = _mm256_mul_ps(fscal,dy12);
1963             tz               = _mm256_mul_ps(fscal,dz12);
1964
1965             /* Update vectorial force */
1966             fix1             = _mm256_add_ps(fix1,tx);
1967             fiy1             = _mm256_add_ps(fiy1,ty);
1968             fiz1             = _mm256_add_ps(fiz1,tz);
1969
1970             fjx2             = _mm256_add_ps(fjx2,tx);
1971             fjy2             = _mm256_add_ps(fjy2,ty);
1972             fjz2             = _mm256_add_ps(fjz2,tz);
1973
1974             }
1975
1976             /**************************
1977              * CALCULATE INTERACTIONS *
1978              **************************/
1979
1980             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1981             {
1982
1983             r13              = _mm256_mul_ps(rsq13,rinv13);
1984
1985             /* EWALD ELECTROSTATICS */
1986             
1987             /* Analytical PME correction */
1988             zeta2            = _mm256_mul_ps(beta2,rsq13);
1989             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
1990             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1991             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1992             felec            = _mm256_mul_ps(qq13,felec);
1993             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1994             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1995             velec            = _mm256_sub_ps(rinv13,pmecorrV);
1996             velec            = _mm256_mul_ps(qq13,velec);
1997             
1998             d                = _mm256_sub_ps(r13,rswitch);
1999             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2000             d2               = _mm256_mul_ps(d,d);
2001             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2002
2003             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2004
2005             /* Evaluate switch function */
2006             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2007             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2008             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2009
2010             fscal            = felec;
2011
2012             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2013
2014             /* Calculate temporary vectorial force */
2015             tx               = _mm256_mul_ps(fscal,dx13);
2016             ty               = _mm256_mul_ps(fscal,dy13);
2017             tz               = _mm256_mul_ps(fscal,dz13);
2018
2019             /* Update vectorial force */
2020             fix1             = _mm256_add_ps(fix1,tx);
2021             fiy1             = _mm256_add_ps(fiy1,ty);
2022             fiz1             = _mm256_add_ps(fiz1,tz);
2023
2024             fjx3             = _mm256_add_ps(fjx3,tx);
2025             fjy3             = _mm256_add_ps(fjy3,ty);
2026             fjz3             = _mm256_add_ps(fjz3,tz);
2027
2028             }
2029
2030             /**************************
2031              * CALCULATE INTERACTIONS *
2032              **************************/
2033
2034             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2035             {
2036
2037             r21              = _mm256_mul_ps(rsq21,rinv21);
2038
2039             /* EWALD ELECTROSTATICS */
2040             
2041             /* Analytical PME correction */
2042             zeta2            = _mm256_mul_ps(beta2,rsq21);
2043             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2044             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2045             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2046             felec            = _mm256_mul_ps(qq21,felec);
2047             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2048             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2049             velec            = _mm256_sub_ps(rinv21,pmecorrV);
2050             velec            = _mm256_mul_ps(qq21,velec);
2051             
2052             d                = _mm256_sub_ps(r21,rswitch);
2053             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2054             d2               = _mm256_mul_ps(d,d);
2055             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2056
2057             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2058
2059             /* Evaluate switch function */
2060             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2061             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2062             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2063
2064             fscal            = felec;
2065
2066             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2067
2068             /* Calculate temporary vectorial force */
2069             tx               = _mm256_mul_ps(fscal,dx21);
2070             ty               = _mm256_mul_ps(fscal,dy21);
2071             tz               = _mm256_mul_ps(fscal,dz21);
2072
2073             /* Update vectorial force */
2074             fix2             = _mm256_add_ps(fix2,tx);
2075             fiy2             = _mm256_add_ps(fiy2,ty);
2076             fiz2             = _mm256_add_ps(fiz2,tz);
2077
2078             fjx1             = _mm256_add_ps(fjx1,tx);
2079             fjy1             = _mm256_add_ps(fjy1,ty);
2080             fjz1             = _mm256_add_ps(fjz1,tz);
2081
2082             }
2083
2084             /**************************
2085              * CALCULATE INTERACTIONS *
2086              **************************/
2087
2088             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2089             {
2090
2091             r22              = _mm256_mul_ps(rsq22,rinv22);
2092
2093             /* EWALD ELECTROSTATICS */
2094             
2095             /* Analytical PME correction */
2096             zeta2            = _mm256_mul_ps(beta2,rsq22);
2097             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2098             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2099             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2100             felec            = _mm256_mul_ps(qq22,felec);
2101             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2102             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2103             velec            = _mm256_sub_ps(rinv22,pmecorrV);
2104             velec            = _mm256_mul_ps(qq22,velec);
2105             
2106             d                = _mm256_sub_ps(r22,rswitch);
2107             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2108             d2               = _mm256_mul_ps(d,d);
2109             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2110
2111             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2112
2113             /* Evaluate switch function */
2114             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2115             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2116             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2117
2118             fscal            = felec;
2119
2120             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2121
2122             /* Calculate temporary vectorial force */
2123             tx               = _mm256_mul_ps(fscal,dx22);
2124             ty               = _mm256_mul_ps(fscal,dy22);
2125             tz               = _mm256_mul_ps(fscal,dz22);
2126
2127             /* Update vectorial force */
2128             fix2             = _mm256_add_ps(fix2,tx);
2129             fiy2             = _mm256_add_ps(fiy2,ty);
2130             fiz2             = _mm256_add_ps(fiz2,tz);
2131
2132             fjx2             = _mm256_add_ps(fjx2,tx);
2133             fjy2             = _mm256_add_ps(fjy2,ty);
2134             fjz2             = _mm256_add_ps(fjz2,tz);
2135
2136             }
2137
2138             /**************************
2139              * CALCULATE INTERACTIONS *
2140              **************************/
2141
2142             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2143             {
2144
2145             r23              = _mm256_mul_ps(rsq23,rinv23);
2146
2147             /* EWALD ELECTROSTATICS */
2148             
2149             /* Analytical PME correction */
2150             zeta2            = _mm256_mul_ps(beta2,rsq23);
2151             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
2152             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2153             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2154             felec            = _mm256_mul_ps(qq23,felec);
2155             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2156             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2157             velec            = _mm256_sub_ps(rinv23,pmecorrV);
2158             velec            = _mm256_mul_ps(qq23,velec);
2159             
2160             d                = _mm256_sub_ps(r23,rswitch);
2161             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2162             d2               = _mm256_mul_ps(d,d);
2163             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2164
2165             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2166
2167             /* Evaluate switch function */
2168             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2169             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2170             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2171
2172             fscal            = felec;
2173
2174             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2175
2176             /* Calculate temporary vectorial force */
2177             tx               = _mm256_mul_ps(fscal,dx23);
2178             ty               = _mm256_mul_ps(fscal,dy23);
2179             tz               = _mm256_mul_ps(fscal,dz23);
2180
2181             /* Update vectorial force */
2182             fix2             = _mm256_add_ps(fix2,tx);
2183             fiy2             = _mm256_add_ps(fiy2,ty);
2184             fiz2             = _mm256_add_ps(fiz2,tz);
2185
2186             fjx3             = _mm256_add_ps(fjx3,tx);
2187             fjy3             = _mm256_add_ps(fjy3,ty);
2188             fjz3             = _mm256_add_ps(fjz3,tz);
2189
2190             }
2191
2192             /**************************
2193              * CALCULATE INTERACTIONS *
2194              **************************/
2195
2196             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2197             {
2198
2199             r31              = _mm256_mul_ps(rsq31,rinv31);
2200
2201             /* EWALD ELECTROSTATICS */
2202             
2203             /* Analytical PME correction */
2204             zeta2            = _mm256_mul_ps(beta2,rsq31);
2205             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
2206             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2207             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2208             felec            = _mm256_mul_ps(qq31,felec);
2209             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2210             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2211             velec            = _mm256_sub_ps(rinv31,pmecorrV);
2212             velec            = _mm256_mul_ps(qq31,velec);
2213             
2214             d                = _mm256_sub_ps(r31,rswitch);
2215             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2216             d2               = _mm256_mul_ps(d,d);
2217             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2218
2219             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2220
2221             /* Evaluate switch function */
2222             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2223             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2224             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2225
2226             fscal            = felec;
2227
2228             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2229
2230             /* Calculate temporary vectorial force */
2231             tx               = _mm256_mul_ps(fscal,dx31);
2232             ty               = _mm256_mul_ps(fscal,dy31);
2233             tz               = _mm256_mul_ps(fscal,dz31);
2234
2235             /* Update vectorial force */
2236             fix3             = _mm256_add_ps(fix3,tx);
2237             fiy3             = _mm256_add_ps(fiy3,ty);
2238             fiz3             = _mm256_add_ps(fiz3,tz);
2239
2240             fjx1             = _mm256_add_ps(fjx1,tx);
2241             fjy1             = _mm256_add_ps(fjy1,ty);
2242             fjz1             = _mm256_add_ps(fjz1,tz);
2243
2244             }
2245
2246             /**************************
2247              * CALCULATE INTERACTIONS *
2248              **************************/
2249
2250             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2251             {
2252
2253             r32              = _mm256_mul_ps(rsq32,rinv32);
2254
2255             /* EWALD ELECTROSTATICS */
2256             
2257             /* Analytical PME correction */
2258             zeta2            = _mm256_mul_ps(beta2,rsq32);
2259             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
2260             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2261             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2262             felec            = _mm256_mul_ps(qq32,felec);
2263             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2264             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2265             velec            = _mm256_sub_ps(rinv32,pmecorrV);
2266             velec            = _mm256_mul_ps(qq32,velec);
2267             
2268             d                = _mm256_sub_ps(r32,rswitch);
2269             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2270             d2               = _mm256_mul_ps(d,d);
2271             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2272
2273             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2274
2275             /* Evaluate switch function */
2276             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2277             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2278             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2279
2280             fscal            = felec;
2281
2282             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2283
2284             /* Calculate temporary vectorial force */
2285             tx               = _mm256_mul_ps(fscal,dx32);
2286             ty               = _mm256_mul_ps(fscal,dy32);
2287             tz               = _mm256_mul_ps(fscal,dz32);
2288
2289             /* Update vectorial force */
2290             fix3             = _mm256_add_ps(fix3,tx);
2291             fiy3             = _mm256_add_ps(fiy3,ty);
2292             fiz3             = _mm256_add_ps(fiz3,tz);
2293
2294             fjx2             = _mm256_add_ps(fjx2,tx);
2295             fjy2             = _mm256_add_ps(fjy2,ty);
2296             fjz2             = _mm256_add_ps(fjz2,tz);
2297
2298             }
2299
2300             /**************************
2301              * CALCULATE INTERACTIONS *
2302              **************************/
2303
2304             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2305             {
2306
2307             r33              = _mm256_mul_ps(rsq33,rinv33);
2308
2309             /* EWALD ELECTROSTATICS */
2310             
2311             /* Analytical PME correction */
2312             zeta2            = _mm256_mul_ps(beta2,rsq33);
2313             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
2314             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2315             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2316             felec            = _mm256_mul_ps(qq33,felec);
2317             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2318             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2319             velec            = _mm256_sub_ps(rinv33,pmecorrV);
2320             velec            = _mm256_mul_ps(qq33,velec);
2321             
2322             d                = _mm256_sub_ps(r33,rswitch);
2323             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2324             d2               = _mm256_mul_ps(d,d);
2325             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2326
2327             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2328
2329             /* Evaluate switch function */
2330             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2331             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2332             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2333
2334             fscal            = felec;
2335
2336             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2337
2338             /* Calculate temporary vectorial force */
2339             tx               = _mm256_mul_ps(fscal,dx33);
2340             ty               = _mm256_mul_ps(fscal,dy33);
2341             tz               = _mm256_mul_ps(fscal,dz33);
2342
2343             /* Update vectorial force */
2344             fix3             = _mm256_add_ps(fix3,tx);
2345             fiy3             = _mm256_add_ps(fiy3,ty);
2346             fiz3             = _mm256_add_ps(fiz3,tz);
2347
2348             fjx3             = _mm256_add_ps(fjx3,tx);
2349             fjy3             = _mm256_add_ps(fjy3,ty);
2350             fjz3             = _mm256_add_ps(fjz3,tz);
2351
2352             }
2353
2354             fjptrA             = f+j_coord_offsetA;
2355             fjptrB             = f+j_coord_offsetB;
2356             fjptrC             = f+j_coord_offsetC;
2357             fjptrD             = f+j_coord_offsetD;
2358             fjptrE             = f+j_coord_offsetE;
2359             fjptrF             = f+j_coord_offsetF;
2360             fjptrG             = f+j_coord_offsetG;
2361             fjptrH             = f+j_coord_offsetH;
2362
2363             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2364                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2365                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2366
2367             /* Inner loop uses 945 flops */
2368         }
2369
2370         if(jidx<j_index_end)
2371         {
2372
2373             /* Get j neighbor index, and coordinate index */
2374             jnrlistA         = jjnr[jidx];
2375             jnrlistB         = jjnr[jidx+1];
2376             jnrlistC         = jjnr[jidx+2];
2377             jnrlistD         = jjnr[jidx+3];
2378             jnrlistE         = jjnr[jidx+4];
2379             jnrlistF         = jjnr[jidx+5];
2380             jnrlistG         = jjnr[jidx+6];
2381             jnrlistH         = jjnr[jidx+7];
2382             /* Sign of each element will be negative for non-real atoms.
2383              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2384              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2385              */
2386             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2387                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2388                                             
2389             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2390             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2391             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2392             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2393             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2394             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2395             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2396             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2397             j_coord_offsetA  = DIM*jnrA;
2398             j_coord_offsetB  = DIM*jnrB;
2399             j_coord_offsetC  = DIM*jnrC;
2400             j_coord_offsetD  = DIM*jnrD;
2401             j_coord_offsetE  = DIM*jnrE;
2402             j_coord_offsetF  = DIM*jnrF;
2403             j_coord_offsetG  = DIM*jnrG;
2404             j_coord_offsetH  = DIM*jnrH;
2405
2406             /* load j atom coordinates */
2407             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2408                                                  x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2409                                                  x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
2410                                                  x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
2411                                                  &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2412
2413             /* Calculate displacement vector */
2414             dx11             = _mm256_sub_ps(ix1,jx1);
2415             dy11             = _mm256_sub_ps(iy1,jy1);
2416             dz11             = _mm256_sub_ps(iz1,jz1);
2417             dx12             = _mm256_sub_ps(ix1,jx2);
2418             dy12             = _mm256_sub_ps(iy1,jy2);
2419             dz12             = _mm256_sub_ps(iz1,jz2);
2420             dx13             = _mm256_sub_ps(ix1,jx3);
2421             dy13             = _mm256_sub_ps(iy1,jy3);
2422             dz13             = _mm256_sub_ps(iz1,jz3);
2423             dx21             = _mm256_sub_ps(ix2,jx1);
2424             dy21             = _mm256_sub_ps(iy2,jy1);
2425             dz21             = _mm256_sub_ps(iz2,jz1);
2426             dx22             = _mm256_sub_ps(ix2,jx2);
2427             dy22             = _mm256_sub_ps(iy2,jy2);
2428             dz22             = _mm256_sub_ps(iz2,jz2);
2429             dx23             = _mm256_sub_ps(ix2,jx3);
2430             dy23             = _mm256_sub_ps(iy2,jy3);
2431             dz23             = _mm256_sub_ps(iz2,jz3);
2432             dx31             = _mm256_sub_ps(ix3,jx1);
2433             dy31             = _mm256_sub_ps(iy3,jy1);
2434             dz31             = _mm256_sub_ps(iz3,jz1);
2435             dx32             = _mm256_sub_ps(ix3,jx2);
2436             dy32             = _mm256_sub_ps(iy3,jy2);
2437             dz32             = _mm256_sub_ps(iz3,jz2);
2438             dx33             = _mm256_sub_ps(ix3,jx3);
2439             dy33             = _mm256_sub_ps(iy3,jy3);
2440             dz33             = _mm256_sub_ps(iz3,jz3);
2441
2442             /* Calculate squared distance and things based on it */
2443             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2444             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2445             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2446             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2447             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2448             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2449             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2450             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2451             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2452
2453             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2454             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2455             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
2456             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2457             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2458             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
2459             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
2460             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
2461             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
2462
2463             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
2464             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
2465             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
2466             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
2467             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
2468             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
2469             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
2470             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
2471             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
2472
2473             fjx1             = _mm256_setzero_ps();
2474             fjy1             = _mm256_setzero_ps();
2475             fjz1             = _mm256_setzero_ps();
2476             fjx2             = _mm256_setzero_ps();
2477             fjy2             = _mm256_setzero_ps();
2478             fjz2             = _mm256_setzero_ps();
2479             fjx3             = _mm256_setzero_ps();
2480             fjy3             = _mm256_setzero_ps();
2481             fjz3             = _mm256_setzero_ps();
2482
2483             /**************************
2484              * CALCULATE INTERACTIONS *
2485              **************************/
2486
2487             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2488             {
2489
2490             r11              = _mm256_mul_ps(rsq11,rinv11);
2491             r11              = _mm256_andnot_ps(dummy_mask,r11);
2492
2493             /* EWALD ELECTROSTATICS */
2494             
2495             /* Analytical PME correction */
2496             zeta2            = _mm256_mul_ps(beta2,rsq11);
2497             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2498             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2499             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2500             felec            = _mm256_mul_ps(qq11,felec);
2501             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2502             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2503             velec            = _mm256_sub_ps(rinv11,pmecorrV);
2504             velec            = _mm256_mul_ps(qq11,velec);
2505             
2506             d                = _mm256_sub_ps(r11,rswitch);
2507             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2508             d2               = _mm256_mul_ps(d,d);
2509             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2510
2511             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2512
2513             /* Evaluate switch function */
2514             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2515             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv11,_mm256_mul_ps(velec,dsw)) );
2516             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2517
2518             fscal            = felec;
2519
2520             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2521
2522             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2523
2524             /* Calculate temporary vectorial force */
2525             tx               = _mm256_mul_ps(fscal,dx11);
2526             ty               = _mm256_mul_ps(fscal,dy11);
2527             tz               = _mm256_mul_ps(fscal,dz11);
2528
2529             /* Update vectorial force */
2530             fix1             = _mm256_add_ps(fix1,tx);
2531             fiy1             = _mm256_add_ps(fiy1,ty);
2532             fiz1             = _mm256_add_ps(fiz1,tz);
2533
2534             fjx1             = _mm256_add_ps(fjx1,tx);
2535             fjy1             = _mm256_add_ps(fjy1,ty);
2536             fjz1             = _mm256_add_ps(fjz1,tz);
2537
2538             }
2539
2540             /**************************
2541              * CALCULATE INTERACTIONS *
2542              **************************/
2543
2544             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2545             {
2546
2547             r12              = _mm256_mul_ps(rsq12,rinv12);
2548             r12              = _mm256_andnot_ps(dummy_mask,r12);
2549
2550             /* EWALD ELECTROSTATICS */
2551             
2552             /* Analytical PME correction */
2553             zeta2            = _mm256_mul_ps(beta2,rsq12);
2554             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2555             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2556             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2557             felec            = _mm256_mul_ps(qq12,felec);
2558             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2559             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2560             velec            = _mm256_sub_ps(rinv12,pmecorrV);
2561             velec            = _mm256_mul_ps(qq12,velec);
2562             
2563             d                = _mm256_sub_ps(r12,rswitch);
2564             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2565             d2               = _mm256_mul_ps(d,d);
2566             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2567
2568             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2569
2570             /* Evaluate switch function */
2571             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2572             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv12,_mm256_mul_ps(velec,dsw)) );
2573             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2574
2575             fscal            = felec;
2576
2577             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2578
2579             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2580
2581             /* Calculate temporary vectorial force */
2582             tx               = _mm256_mul_ps(fscal,dx12);
2583             ty               = _mm256_mul_ps(fscal,dy12);
2584             tz               = _mm256_mul_ps(fscal,dz12);
2585
2586             /* Update vectorial force */
2587             fix1             = _mm256_add_ps(fix1,tx);
2588             fiy1             = _mm256_add_ps(fiy1,ty);
2589             fiz1             = _mm256_add_ps(fiz1,tz);
2590
2591             fjx2             = _mm256_add_ps(fjx2,tx);
2592             fjy2             = _mm256_add_ps(fjy2,ty);
2593             fjz2             = _mm256_add_ps(fjz2,tz);
2594
2595             }
2596
2597             /**************************
2598              * CALCULATE INTERACTIONS *
2599              **************************/
2600
2601             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2602             {
2603
2604             r13              = _mm256_mul_ps(rsq13,rinv13);
2605             r13              = _mm256_andnot_ps(dummy_mask,r13);
2606
2607             /* EWALD ELECTROSTATICS */
2608             
2609             /* Analytical PME correction */
2610             zeta2            = _mm256_mul_ps(beta2,rsq13);
2611             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
2612             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2613             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2614             felec            = _mm256_mul_ps(qq13,felec);
2615             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2616             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2617             velec            = _mm256_sub_ps(rinv13,pmecorrV);
2618             velec            = _mm256_mul_ps(qq13,velec);
2619             
2620             d                = _mm256_sub_ps(r13,rswitch);
2621             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2622             d2               = _mm256_mul_ps(d,d);
2623             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2624
2625             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2626
2627             /* Evaluate switch function */
2628             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2629             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv13,_mm256_mul_ps(velec,dsw)) );
2630             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2631
2632             fscal            = felec;
2633
2634             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2635
2636             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2637
2638             /* Calculate temporary vectorial force */
2639             tx               = _mm256_mul_ps(fscal,dx13);
2640             ty               = _mm256_mul_ps(fscal,dy13);
2641             tz               = _mm256_mul_ps(fscal,dz13);
2642
2643             /* Update vectorial force */
2644             fix1             = _mm256_add_ps(fix1,tx);
2645             fiy1             = _mm256_add_ps(fiy1,ty);
2646             fiz1             = _mm256_add_ps(fiz1,tz);
2647
2648             fjx3             = _mm256_add_ps(fjx3,tx);
2649             fjy3             = _mm256_add_ps(fjy3,ty);
2650             fjz3             = _mm256_add_ps(fjz3,tz);
2651
2652             }
2653
2654             /**************************
2655              * CALCULATE INTERACTIONS *
2656              **************************/
2657
2658             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2659             {
2660
2661             r21              = _mm256_mul_ps(rsq21,rinv21);
2662             r21              = _mm256_andnot_ps(dummy_mask,r21);
2663
2664             /* EWALD ELECTROSTATICS */
2665             
2666             /* Analytical PME correction */
2667             zeta2            = _mm256_mul_ps(beta2,rsq21);
2668             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2669             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2670             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2671             felec            = _mm256_mul_ps(qq21,felec);
2672             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2673             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2674             velec            = _mm256_sub_ps(rinv21,pmecorrV);
2675             velec            = _mm256_mul_ps(qq21,velec);
2676             
2677             d                = _mm256_sub_ps(r21,rswitch);
2678             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2679             d2               = _mm256_mul_ps(d,d);
2680             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2681
2682             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2683
2684             /* Evaluate switch function */
2685             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2686             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv21,_mm256_mul_ps(velec,dsw)) );
2687             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2688
2689             fscal            = felec;
2690
2691             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2692
2693             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2694
2695             /* Calculate temporary vectorial force */
2696             tx               = _mm256_mul_ps(fscal,dx21);
2697             ty               = _mm256_mul_ps(fscal,dy21);
2698             tz               = _mm256_mul_ps(fscal,dz21);
2699
2700             /* Update vectorial force */
2701             fix2             = _mm256_add_ps(fix2,tx);
2702             fiy2             = _mm256_add_ps(fiy2,ty);
2703             fiz2             = _mm256_add_ps(fiz2,tz);
2704
2705             fjx1             = _mm256_add_ps(fjx1,tx);
2706             fjy1             = _mm256_add_ps(fjy1,ty);
2707             fjz1             = _mm256_add_ps(fjz1,tz);
2708
2709             }
2710
2711             /**************************
2712              * CALCULATE INTERACTIONS *
2713              **************************/
2714
2715             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2716             {
2717
2718             r22              = _mm256_mul_ps(rsq22,rinv22);
2719             r22              = _mm256_andnot_ps(dummy_mask,r22);
2720
2721             /* EWALD ELECTROSTATICS */
2722             
2723             /* Analytical PME correction */
2724             zeta2            = _mm256_mul_ps(beta2,rsq22);
2725             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2726             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2727             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2728             felec            = _mm256_mul_ps(qq22,felec);
2729             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2730             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2731             velec            = _mm256_sub_ps(rinv22,pmecorrV);
2732             velec            = _mm256_mul_ps(qq22,velec);
2733             
2734             d                = _mm256_sub_ps(r22,rswitch);
2735             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2736             d2               = _mm256_mul_ps(d,d);
2737             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2738
2739             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2740
2741             /* Evaluate switch function */
2742             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2743             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv22,_mm256_mul_ps(velec,dsw)) );
2744             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2745
2746             fscal            = felec;
2747
2748             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2749
2750             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2751
2752             /* Calculate temporary vectorial force */
2753             tx               = _mm256_mul_ps(fscal,dx22);
2754             ty               = _mm256_mul_ps(fscal,dy22);
2755             tz               = _mm256_mul_ps(fscal,dz22);
2756
2757             /* Update vectorial force */
2758             fix2             = _mm256_add_ps(fix2,tx);
2759             fiy2             = _mm256_add_ps(fiy2,ty);
2760             fiz2             = _mm256_add_ps(fiz2,tz);
2761
2762             fjx2             = _mm256_add_ps(fjx2,tx);
2763             fjy2             = _mm256_add_ps(fjy2,ty);
2764             fjz2             = _mm256_add_ps(fjz2,tz);
2765
2766             }
2767
2768             /**************************
2769              * CALCULATE INTERACTIONS *
2770              **************************/
2771
2772             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2773             {
2774
2775             r23              = _mm256_mul_ps(rsq23,rinv23);
2776             r23              = _mm256_andnot_ps(dummy_mask,r23);
2777
2778             /* EWALD ELECTROSTATICS */
2779             
2780             /* Analytical PME correction */
2781             zeta2            = _mm256_mul_ps(beta2,rsq23);
2782             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
2783             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2784             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2785             felec            = _mm256_mul_ps(qq23,felec);
2786             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2787             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2788             velec            = _mm256_sub_ps(rinv23,pmecorrV);
2789             velec            = _mm256_mul_ps(qq23,velec);
2790             
2791             d                = _mm256_sub_ps(r23,rswitch);
2792             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2793             d2               = _mm256_mul_ps(d,d);
2794             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2795
2796             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2797
2798             /* Evaluate switch function */
2799             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2800             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv23,_mm256_mul_ps(velec,dsw)) );
2801             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2802
2803             fscal            = felec;
2804
2805             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2806
2807             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2808
2809             /* Calculate temporary vectorial force */
2810             tx               = _mm256_mul_ps(fscal,dx23);
2811             ty               = _mm256_mul_ps(fscal,dy23);
2812             tz               = _mm256_mul_ps(fscal,dz23);
2813
2814             /* Update vectorial force */
2815             fix2             = _mm256_add_ps(fix2,tx);
2816             fiy2             = _mm256_add_ps(fiy2,ty);
2817             fiz2             = _mm256_add_ps(fiz2,tz);
2818
2819             fjx3             = _mm256_add_ps(fjx3,tx);
2820             fjy3             = _mm256_add_ps(fjy3,ty);
2821             fjz3             = _mm256_add_ps(fjz3,tz);
2822
2823             }
2824
2825             /**************************
2826              * CALCULATE INTERACTIONS *
2827              **************************/
2828
2829             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2830             {
2831
2832             r31              = _mm256_mul_ps(rsq31,rinv31);
2833             r31              = _mm256_andnot_ps(dummy_mask,r31);
2834
2835             /* EWALD ELECTROSTATICS */
2836             
2837             /* Analytical PME correction */
2838             zeta2            = _mm256_mul_ps(beta2,rsq31);
2839             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
2840             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2841             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2842             felec            = _mm256_mul_ps(qq31,felec);
2843             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2844             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2845             velec            = _mm256_sub_ps(rinv31,pmecorrV);
2846             velec            = _mm256_mul_ps(qq31,velec);
2847             
2848             d                = _mm256_sub_ps(r31,rswitch);
2849             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2850             d2               = _mm256_mul_ps(d,d);
2851             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2852
2853             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2854
2855             /* Evaluate switch function */
2856             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2857             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv31,_mm256_mul_ps(velec,dsw)) );
2858             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2859
2860             fscal            = felec;
2861
2862             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2863
2864             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2865
2866             /* Calculate temporary vectorial force */
2867             tx               = _mm256_mul_ps(fscal,dx31);
2868             ty               = _mm256_mul_ps(fscal,dy31);
2869             tz               = _mm256_mul_ps(fscal,dz31);
2870
2871             /* Update vectorial force */
2872             fix3             = _mm256_add_ps(fix3,tx);
2873             fiy3             = _mm256_add_ps(fiy3,ty);
2874             fiz3             = _mm256_add_ps(fiz3,tz);
2875
2876             fjx1             = _mm256_add_ps(fjx1,tx);
2877             fjy1             = _mm256_add_ps(fjy1,ty);
2878             fjz1             = _mm256_add_ps(fjz1,tz);
2879
2880             }
2881
2882             /**************************
2883              * CALCULATE INTERACTIONS *
2884              **************************/
2885
2886             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2887             {
2888
2889             r32              = _mm256_mul_ps(rsq32,rinv32);
2890             r32              = _mm256_andnot_ps(dummy_mask,r32);
2891
2892             /* EWALD ELECTROSTATICS */
2893             
2894             /* Analytical PME correction */
2895             zeta2            = _mm256_mul_ps(beta2,rsq32);
2896             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
2897             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2898             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2899             felec            = _mm256_mul_ps(qq32,felec);
2900             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2901             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2902             velec            = _mm256_sub_ps(rinv32,pmecorrV);
2903             velec            = _mm256_mul_ps(qq32,velec);
2904             
2905             d                = _mm256_sub_ps(r32,rswitch);
2906             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2907             d2               = _mm256_mul_ps(d,d);
2908             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2909
2910             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2911
2912             /* Evaluate switch function */
2913             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2914             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv32,_mm256_mul_ps(velec,dsw)) );
2915             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2916
2917             fscal            = felec;
2918
2919             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2920
2921             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2922
2923             /* Calculate temporary vectorial force */
2924             tx               = _mm256_mul_ps(fscal,dx32);
2925             ty               = _mm256_mul_ps(fscal,dy32);
2926             tz               = _mm256_mul_ps(fscal,dz32);
2927
2928             /* Update vectorial force */
2929             fix3             = _mm256_add_ps(fix3,tx);
2930             fiy3             = _mm256_add_ps(fiy3,ty);
2931             fiz3             = _mm256_add_ps(fiz3,tz);
2932
2933             fjx2             = _mm256_add_ps(fjx2,tx);
2934             fjy2             = _mm256_add_ps(fjy2,ty);
2935             fjz2             = _mm256_add_ps(fjz2,tz);
2936
2937             }
2938
2939             /**************************
2940              * CALCULATE INTERACTIONS *
2941              **************************/
2942
2943             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2944             {
2945
2946             r33              = _mm256_mul_ps(rsq33,rinv33);
2947             r33              = _mm256_andnot_ps(dummy_mask,r33);
2948
2949             /* EWALD ELECTROSTATICS */
2950             
2951             /* Analytical PME correction */
2952             zeta2            = _mm256_mul_ps(beta2,rsq33);
2953             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
2954             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2955             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2956             felec            = _mm256_mul_ps(qq33,felec);
2957             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
2958             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
2959             velec            = _mm256_sub_ps(rinv33,pmecorrV);
2960             velec            = _mm256_mul_ps(qq33,velec);
2961             
2962             d                = _mm256_sub_ps(r33,rswitch);
2963             d                = _mm256_max_ps(d,_mm256_setzero_ps());
2964             d2               = _mm256_mul_ps(d,d);
2965             sw               = _mm256_add_ps(one,_mm256_mul_ps(d2,_mm256_mul_ps(d,_mm256_add_ps(swV3,_mm256_mul_ps(d,_mm256_add_ps(swV4,_mm256_mul_ps(d,swV5)))))));
2966
2967             dsw              = _mm256_mul_ps(d2,_mm256_add_ps(swF2,_mm256_mul_ps(d,_mm256_add_ps(swF3,_mm256_mul_ps(d,swF4)))));
2968
2969             /* Evaluate switch function */
2970             /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
2971             felec            = _mm256_sub_ps( _mm256_mul_ps(felec,sw) , _mm256_mul_ps(rinv33,_mm256_mul_ps(velec,dsw)) );
2972             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2973
2974             fscal            = felec;
2975
2976             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2977
2978             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2979
2980             /* Calculate temporary vectorial force */
2981             tx               = _mm256_mul_ps(fscal,dx33);
2982             ty               = _mm256_mul_ps(fscal,dy33);
2983             tz               = _mm256_mul_ps(fscal,dz33);
2984
2985             /* Update vectorial force */
2986             fix3             = _mm256_add_ps(fix3,tx);
2987             fiy3             = _mm256_add_ps(fiy3,ty);
2988             fiz3             = _mm256_add_ps(fiz3,tz);
2989
2990             fjx3             = _mm256_add_ps(fjx3,tx);
2991             fjy3             = _mm256_add_ps(fjy3,ty);
2992             fjz3             = _mm256_add_ps(fjz3,tz);
2993
2994             }
2995
2996             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2997             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2998             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2999             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3000             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3001             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3002             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3003             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
3004
3005             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
3006                                                       fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
3007                                                       fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3008
3009             /* Inner loop uses 954 flops */
3010         }
3011
3012         /* End of innermost loop */
3013
3014         gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3015                                                  f+i_coord_offset+DIM,fshift+i_shift_offset);
3016
3017         /* Increment number of inner iterations */
3018         inneriter                  += j_index_end - j_index_start;
3019
3020         /* Outer loop uses 18 flops */
3021     }
3022
3023     /* Increment number of outer iterations */
3024     outeriter        += nri;
3025
3026     /* Update outer/inner flops */
3027
3028     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*954);
3029 }