d0bc7df10102a4fa897b95d62bb75f08bd213eb4
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "config.h"
39
40 #include <math.h>
41
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
46
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
49
50 /*
51  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_single
52  * Electrostatics interaction: Ewald
53  * VdW interaction:            LennardJones
54  * Geometry:                   Water4-Water4
55  * Calculate force/pot:        PotentialAndForce
56  */
57 void
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_VF_avx_256_single
59                     (t_nblist                    * gmx_restrict       nlist,
60                      rvec                        * gmx_restrict          xx,
61                      rvec                        * gmx_restrict          ff,
62                      t_forcerec                  * gmx_restrict          fr,
63                      t_mdatoms                   * gmx_restrict     mdatoms,
64                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65                      t_nrnb                      * gmx_restrict        nrnb)
66 {
67     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
68      * just 0 for non-waters.
69      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70      * jnr indices corresponding to data put in the four positions in the SIMD register.
71      */
72     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
73     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74     int              jnrA,jnrB,jnrC,jnrD;
75     int              jnrE,jnrF,jnrG,jnrH;
76     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
81     real             rcutoff_scalar;
82     real             *shiftvec,*fshift,*x,*f;
83     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84     real             scratch[4*DIM];
85     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86     real *           vdwioffsetptr0;
87     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     real *           vdwioffsetptr1;
89     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90     real *           vdwioffsetptr2;
91     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92     real *           vdwioffsetptr3;
93     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
101     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
102     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
103     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
104     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
105     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
106     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
107     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
108     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
109     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
110     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
111     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
112     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
113     real             *charge;
114     int              nvdwtype;
115     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
116     int              *vdwtype;
117     real             *vdwparam;
118     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
119     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
120     __m256i          ewitab;
121     __m128i          ewitab_lo,ewitab_hi;
122     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
123     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
124     real             *ewtab;
125     __m256           dummy_mask,cutoff_mask;
126     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
127     __m256           one     = _mm256_set1_ps(1.0);
128     __m256           two     = _mm256_set1_ps(2.0);
129     x                = xx[0];
130     f                = ff[0];
131
132     nri              = nlist->nri;
133     iinr             = nlist->iinr;
134     jindex           = nlist->jindex;
135     jjnr             = nlist->jjnr;
136     shiftidx         = nlist->shift;
137     gid              = nlist->gid;
138     shiftvec         = fr->shift_vec[0];
139     fshift           = fr->fshift[0];
140     facel            = _mm256_set1_ps(fr->epsfac);
141     charge           = mdatoms->chargeA;
142     nvdwtype         = fr->ntype;
143     vdwparam         = fr->nbfp;
144     vdwtype          = mdatoms->typeA;
145
146     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
147     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
148     beta2            = _mm256_mul_ps(beta,beta);
149     beta3            = _mm256_mul_ps(beta,beta2);
150
151     ewtab            = fr->ic->tabq_coul_FDV0;
152     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
153     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
154
155     /* Setup water-specific parameters */
156     inr              = nlist->iinr[0];
157     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
158     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
159     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
160     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
161
162     jq1              = _mm256_set1_ps(charge[inr+1]);
163     jq2              = _mm256_set1_ps(charge[inr+2]);
164     jq3              = _mm256_set1_ps(charge[inr+3]);
165     vdwjidx0A        = 2*vdwtype[inr+0];
166     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
167     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
168     qq11             = _mm256_mul_ps(iq1,jq1);
169     qq12             = _mm256_mul_ps(iq1,jq2);
170     qq13             = _mm256_mul_ps(iq1,jq3);
171     qq21             = _mm256_mul_ps(iq2,jq1);
172     qq22             = _mm256_mul_ps(iq2,jq2);
173     qq23             = _mm256_mul_ps(iq2,jq3);
174     qq31             = _mm256_mul_ps(iq3,jq1);
175     qq32             = _mm256_mul_ps(iq3,jq2);
176     qq33             = _mm256_mul_ps(iq3,jq3);
177
178     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
179     rcutoff_scalar   = fr->rcoulomb;
180     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
181     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
182
183     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
184     rvdw             = _mm256_set1_ps(fr->rvdw);
185
186     /* Avoid stupid compiler warnings */
187     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
188     j_coord_offsetA = 0;
189     j_coord_offsetB = 0;
190     j_coord_offsetC = 0;
191     j_coord_offsetD = 0;
192     j_coord_offsetE = 0;
193     j_coord_offsetF = 0;
194     j_coord_offsetG = 0;
195     j_coord_offsetH = 0;
196
197     outeriter        = 0;
198     inneriter        = 0;
199
200     for(iidx=0;iidx<4*DIM;iidx++)
201     {
202         scratch[iidx] = 0.0;
203     }
204
205     /* Start outer loop over neighborlists */
206     for(iidx=0; iidx<nri; iidx++)
207     {
208         /* Load shift vector for this list */
209         i_shift_offset   = DIM*shiftidx[iidx];
210
211         /* Load limits for loop over neighbors */
212         j_index_start    = jindex[iidx];
213         j_index_end      = jindex[iidx+1];
214
215         /* Get outer coordinate index */
216         inr              = iinr[iidx];
217         i_coord_offset   = DIM*inr;
218
219         /* Load i particle coords and add shift vector */
220         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
221                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
222
223         fix0             = _mm256_setzero_ps();
224         fiy0             = _mm256_setzero_ps();
225         fiz0             = _mm256_setzero_ps();
226         fix1             = _mm256_setzero_ps();
227         fiy1             = _mm256_setzero_ps();
228         fiz1             = _mm256_setzero_ps();
229         fix2             = _mm256_setzero_ps();
230         fiy2             = _mm256_setzero_ps();
231         fiz2             = _mm256_setzero_ps();
232         fix3             = _mm256_setzero_ps();
233         fiy3             = _mm256_setzero_ps();
234         fiz3             = _mm256_setzero_ps();
235
236         /* Reset potential sums */
237         velecsum         = _mm256_setzero_ps();
238         vvdwsum          = _mm256_setzero_ps();
239
240         /* Start inner kernel loop */
241         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
242         {
243
244             /* Get j neighbor index, and coordinate index */
245             jnrA             = jjnr[jidx];
246             jnrB             = jjnr[jidx+1];
247             jnrC             = jjnr[jidx+2];
248             jnrD             = jjnr[jidx+3];
249             jnrE             = jjnr[jidx+4];
250             jnrF             = jjnr[jidx+5];
251             jnrG             = jjnr[jidx+6];
252             jnrH             = jjnr[jidx+7];
253             j_coord_offsetA  = DIM*jnrA;
254             j_coord_offsetB  = DIM*jnrB;
255             j_coord_offsetC  = DIM*jnrC;
256             j_coord_offsetD  = DIM*jnrD;
257             j_coord_offsetE  = DIM*jnrE;
258             j_coord_offsetF  = DIM*jnrF;
259             j_coord_offsetG  = DIM*jnrG;
260             j_coord_offsetH  = DIM*jnrH;
261
262             /* load j atom coordinates */
263             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
264                                                  x+j_coord_offsetC,x+j_coord_offsetD,
265                                                  x+j_coord_offsetE,x+j_coord_offsetF,
266                                                  x+j_coord_offsetG,x+j_coord_offsetH,
267                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
268                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
269
270             /* Calculate displacement vector */
271             dx00             = _mm256_sub_ps(ix0,jx0);
272             dy00             = _mm256_sub_ps(iy0,jy0);
273             dz00             = _mm256_sub_ps(iz0,jz0);
274             dx11             = _mm256_sub_ps(ix1,jx1);
275             dy11             = _mm256_sub_ps(iy1,jy1);
276             dz11             = _mm256_sub_ps(iz1,jz1);
277             dx12             = _mm256_sub_ps(ix1,jx2);
278             dy12             = _mm256_sub_ps(iy1,jy2);
279             dz12             = _mm256_sub_ps(iz1,jz2);
280             dx13             = _mm256_sub_ps(ix1,jx3);
281             dy13             = _mm256_sub_ps(iy1,jy3);
282             dz13             = _mm256_sub_ps(iz1,jz3);
283             dx21             = _mm256_sub_ps(ix2,jx1);
284             dy21             = _mm256_sub_ps(iy2,jy1);
285             dz21             = _mm256_sub_ps(iz2,jz1);
286             dx22             = _mm256_sub_ps(ix2,jx2);
287             dy22             = _mm256_sub_ps(iy2,jy2);
288             dz22             = _mm256_sub_ps(iz2,jz2);
289             dx23             = _mm256_sub_ps(ix2,jx3);
290             dy23             = _mm256_sub_ps(iy2,jy3);
291             dz23             = _mm256_sub_ps(iz2,jz3);
292             dx31             = _mm256_sub_ps(ix3,jx1);
293             dy31             = _mm256_sub_ps(iy3,jy1);
294             dz31             = _mm256_sub_ps(iz3,jz1);
295             dx32             = _mm256_sub_ps(ix3,jx2);
296             dy32             = _mm256_sub_ps(iy3,jy2);
297             dz32             = _mm256_sub_ps(iz3,jz2);
298             dx33             = _mm256_sub_ps(ix3,jx3);
299             dy33             = _mm256_sub_ps(iy3,jy3);
300             dz33             = _mm256_sub_ps(iz3,jz3);
301
302             /* Calculate squared distance and things based on it */
303             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
304             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
305             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
306             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
307             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
308             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
309             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
310             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
311             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
312             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
313
314             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
315             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
316             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
317             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
318             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
319             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
320             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
321             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
322             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
323
324             rinvsq00         = gmx_mm256_inv_ps(rsq00);
325             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
326             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
327             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
328             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
329             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
330             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
331             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
332             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
333             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
334
335             fjx0             = _mm256_setzero_ps();
336             fjy0             = _mm256_setzero_ps();
337             fjz0             = _mm256_setzero_ps();
338             fjx1             = _mm256_setzero_ps();
339             fjy1             = _mm256_setzero_ps();
340             fjz1             = _mm256_setzero_ps();
341             fjx2             = _mm256_setzero_ps();
342             fjy2             = _mm256_setzero_ps();
343             fjz2             = _mm256_setzero_ps();
344             fjx3             = _mm256_setzero_ps();
345             fjy3             = _mm256_setzero_ps();
346             fjz3             = _mm256_setzero_ps();
347
348             /**************************
349              * CALCULATE INTERACTIONS *
350              **************************/
351
352             if (gmx_mm256_any_lt(rsq00,rcutoff2))
353             {
354
355             /* LENNARD-JONES DISPERSION/REPULSION */
356
357             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
358             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
359             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
360             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
361                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
362             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
363
364             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
365
366             /* Update potential sum for this i atom from the interaction with this j atom. */
367             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
368             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
369
370             fscal            = fvdw;
371
372             fscal            = _mm256_and_ps(fscal,cutoff_mask);
373
374             /* Calculate temporary vectorial force */
375             tx               = _mm256_mul_ps(fscal,dx00);
376             ty               = _mm256_mul_ps(fscal,dy00);
377             tz               = _mm256_mul_ps(fscal,dz00);
378
379             /* Update vectorial force */
380             fix0             = _mm256_add_ps(fix0,tx);
381             fiy0             = _mm256_add_ps(fiy0,ty);
382             fiz0             = _mm256_add_ps(fiz0,tz);
383
384             fjx0             = _mm256_add_ps(fjx0,tx);
385             fjy0             = _mm256_add_ps(fjy0,ty);
386             fjz0             = _mm256_add_ps(fjz0,tz);
387
388             }
389
390             /**************************
391              * CALCULATE INTERACTIONS *
392              **************************/
393
394             if (gmx_mm256_any_lt(rsq11,rcutoff2))
395             {
396
397             r11              = _mm256_mul_ps(rsq11,rinv11);
398
399             /* EWALD ELECTROSTATICS */
400             
401             /* Analytical PME correction */
402             zeta2            = _mm256_mul_ps(beta2,rsq11);
403             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
404             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
405             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
406             felec            = _mm256_mul_ps(qq11,felec);
407             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
408             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
409             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
410             velec            = _mm256_mul_ps(qq11,velec);
411             
412             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
413
414             /* Update potential sum for this i atom from the interaction with this j atom. */
415             velec            = _mm256_and_ps(velec,cutoff_mask);
416             velecsum         = _mm256_add_ps(velecsum,velec);
417
418             fscal            = felec;
419
420             fscal            = _mm256_and_ps(fscal,cutoff_mask);
421
422             /* Calculate temporary vectorial force */
423             tx               = _mm256_mul_ps(fscal,dx11);
424             ty               = _mm256_mul_ps(fscal,dy11);
425             tz               = _mm256_mul_ps(fscal,dz11);
426
427             /* Update vectorial force */
428             fix1             = _mm256_add_ps(fix1,tx);
429             fiy1             = _mm256_add_ps(fiy1,ty);
430             fiz1             = _mm256_add_ps(fiz1,tz);
431
432             fjx1             = _mm256_add_ps(fjx1,tx);
433             fjy1             = _mm256_add_ps(fjy1,ty);
434             fjz1             = _mm256_add_ps(fjz1,tz);
435
436             }
437
438             /**************************
439              * CALCULATE INTERACTIONS *
440              **************************/
441
442             if (gmx_mm256_any_lt(rsq12,rcutoff2))
443             {
444
445             r12              = _mm256_mul_ps(rsq12,rinv12);
446
447             /* EWALD ELECTROSTATICS */
448             
449             /* Analytical PME correction */
450             zeta2            = _mm256_mul_ps(beta2,rsq12);
451             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
452             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
453             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
454             felec            = _mm256_mul_ps(qq12,felec);
455             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
456             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
457             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
458             velec            = _mm256_mul_ps(qq12,velec);
459             
460             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
461
462             /* Update potential sum for this i atom from the interaction with this j atom. */
463             velec            = _mm256_and_ps(velec,cutoff_mask);
464             velecsum         = _mm256_add_ps(velecsum,velec);
465
466             fscal            = felec;
467
468             fscal            = _mm256_and_ps(fscal,cutoff_mask);
469
470             /* Calculate temporary vectorial force */
471             tx               = _mm256_mul_ps(fscal,dx12);
472             ty               = _mm256_mul_ps(fscal,dy12);
473             tz               = _mm256_mul_ps(fscal,dz12);
474
475             /* Update vectorial force */
476             fix1             = _mm256_add_ps(fix1,tx);
477             fiy1             = _mm256_add_ps(fiy1,ty);
478             fiz1             = _mm256_add_ps(fiz1,tz);
479
480             fjx2             = _mm256_add_ps(fjx2,tx);
481             fjy2             = _mm256_add_ps(fjy2,ty);
482             fjz2             = _mm256_add_ps(fjz2,tz);
483
484             }
485
486             /**************************
487              * CALCULATE INTERACTIONS *
488              **************************/
489
490             if (gmx_mm256_any_lt(rsq13,rcutoff2))
491             {
492
493             r13              = _mm256_mul_ps(rsq13,rinv13);
494
495             /* EWALD ELECTROSTATICS */
496             
497             /* Analytical PME correction */
498             zeta2            = _mm256_mul_ps(beta2,rsq13);
499             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
500             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
501             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
502             felec            = _mm256_mul_ps(qq13,felec);
503             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
504             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
505             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv13,sh_ewald),pmecorrV);
506             velec            = _mm256_mul_ps(qq13,velec);
507             
508             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
509
510             /* Update potential sum for this i atom from the interaction with this j atom. */
511             velec            = _mm256_and_ps(velec,cutoff_mask);
512             velecsum         = _mm256_add_ps(velecsum,velec);
513
514             fscal            = felec;
515
516             fscal            = _mm256_and_ps(fscal,cutoff_mask);
517
518             /* Calculate temporary vectorial force */
519             tx               = _mm256_mul_ps(fscal,dx13);
520             ty               = _mm256_mul_ps(fscal,dy13);
521             tz               = _mm256_mul_ps(fscal,dz13);
522
523             /* Update vectorial force */
524             fix1             = _mm256_add_ps(fix1,tx);
525             fiy1             = _mm256_add_ps(fiy1,ty);
526             fiz1             = _mm256_add_ps(fiz1,tz);
527
528             fjx3             = _mm256_add_ps(fjx3,tx);
529             fjy3             = _mm256_add_ps(fjy3,ty);
530             fjz3             = _mm256_add_ps(fjz3,tz);
531
532             }
533
534             /**************************
535              * CALCULATE INTERACTIONS *
536              **************************/
537
538             if (gmx_mm256_any_lt(rsq21,rcutoff2))
539             {
540
541             r21              = _mm256_mul_ps(rsq21,rinv21);
542
543             /* EWALD ELECTROSTATICS */
544             
545             /* Analytical PME correction */
546             zeta2            = _mm256_mul_ps(beta2,rsq21);
547             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
548             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
549             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
550             felec            = _mm256_mul_ps(qq21,felec);
551             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
552             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
553             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
554             velec            = _mm256_mul_ps(qq21,velec);
555             
556             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
557
558             /* Update potential sum for this i atom from the interaction with this j atom. */
559             velec            = _mm256_and_ps(velec,cutoff_mask);
560             velecsum         = _mm256_add_ps(velecsum,velec);
561
562             fscal            = felec;
563
564             fscal            = _mm256_and_ps(fscal,cutoff_mask);
565
566             /* Calculate temporary vectorial force */
567             tx               = _mm256_mul_ps(fscal,dx21);
568             ty               = _mm256_mul_ps(fscal,dy21);
569             tz               = _mm256_mul_ps(fscal,dz21);
570
571             /* Update vectorial force */
572             fix2             = _mm256_add_ps(fix2,tx);
573             fiy2             = _mm256_add_ps(fiy2,ty);
574             fiz2             = _mm256_add_ps(fiz2,tz);
575
576             fjx1             = _mm256_add_ps(fjx1,tx);
577             fjy1             = _mm256_add_ps(fjy1,ty);
578             fjz1             = _mm256_add_ps(fjz1,tz);
579
580             }
581
582             /**************************
583              * CALCULATE INTERACTIONS *
584              **************************/
585
586             if (gmx_mm256_any_lt(rsq22,rcutoff2))
587             {
588
589             r22              = _mm256_mul_ps(rsq22,rinv22);
590
591             /* EWALD ELECTROSTATICS */
592             
593             /* Analytical PME correction */
594             zeta2            = _mm256_mul_ps(beta2,rsq22);
595             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
596             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
597             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
598             felec            = _mm256_mul_ps(qq22,felec);
599             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
600             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
601             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
602             velec            = _mm256_mul_ps(qq22,velec);
603             
604             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
605
606             /* Update potential sum for this i atom from the interaction with this j atom. */
607             velec            = _mm256_and_ps(velec,cutoff_mask);
608             velecsum         = _mm256_add_ps(velecsum,velec);
609
610             fscal            = felec;
611
612             fscal            = _mm256_and_ps(fscal,cutoff_mask);
613
614             /* Calculate temporary vectorial force */
615             tx               = _mm256_mul_ps(fscal,dx22);
616             ty               = _mm256_mul_ps(fscal,dy22);
617             tz               = _mm256_mul_ps(fscal,dz22);
618
619             /* Update vectorial force */
620             fix2             = _mm256_add_ps(fix2,tx);
621             fiy2             = _mm256_add_ps(fiy2,ty);
622             fiz2             = _mm256_add_ps(fiz2,tz);
623
624             fjx2             = _mm256_add_ps(fjx2,tx);
625             fjy2             = _mm256_add_ps(fjy2,ty);
626             fjz2             = _mm256_add_ps(fjz2,tz);
627
628             }
629
630             /**************************
631              * CALCULATE INTERACTIONS *
632              **************************/
633
634             if (gmx_mm256_any_lt(rsq23,rcutoff2))
635             {
636
637             r23              = _mm256_mul_ps(rsq23,rinv23);
638
639             /* EWALD ELECTROSTATICS */
640             
641             /* Analytical PME correction */
642             zeta2            = _mm256_mul_ps(beta2,rsq23);
643             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
644             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
645             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
646             felec            = _mm256_mul_ps(qq23,felec);
647             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
648             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
649             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv23,sh_ewald),pmecorrV);
650             velec            = _mm256_mul_ps(qq23,velec);
651             
652             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
653
654             /* Update potential sum for this i atom from the interaction with this j atom. */
655             velec            = _mm256_and_ps(velec,cutoff_mask);
656             velecsum         = _mm256_add_ps(velecsum,velec);
657
658             fscal            = felec;
659
660             fscal            = _mm256_and_ps(fscal,cutoff_mask);
661
662             /* Calculate temporary vectorial force */
663             tx               = _mm256_mul_ps(fscal,dx23);
664             ty               = _mm256_mul_ps(fscal,dy23);
665             tz               = _mm256_mul_ps(fscal,dz23);
666
667             /* Update vectorial force */
668             fix2             = _mm256_add_ps(fix2,tx);
669             fiy2             = _mm256_add_ps(fiy2,ty);
670             fiz2             = _mm256_add_ps(fiz2,tz);
671
672             fjx3             = _mm256_add_ps(fjx3,tx);
673             fjy3             = _mm256_add_ps(fjy3,ty);
674             fjz3             = _mm256_add_ps(fjz3,tz);
675
676             }
677
678             /**************************
679              * CALCULATE INTERACTIONS *
680              **************************/
681
682             if (gmx_mm256_any_lt(rsq31,rcutoff2))
683             {
684
685             r31              = _mm256_mul_ps(rsq31,rinv31);
686
687             /* EWALD ELECTROSTATICS */
688             
689             /* Analytical PME correction */
690             zeta2            = _mm256_mul_ps(beta2,rsq31);
691             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
692             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
693             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
694             felec            = _mm256_mul_ps(qq31,felec);
695             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
696             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
697             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv31,sh_ewald),pmecorrV);
698             velec            = _mm256_mul_ps(qq31,velec);
699             
700             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
701
702             /* Update potential sum for this i atom from the interaction with this j atom. */
703             velec            = _mm256_and_ps(velec,cutoff_mask);
704             velecsum         = _mm256_add_ps(velecsum,velec);
705
706             fscal            = felec;
707
708             fscal            = _mm256_and_ps(fscal,cutoff_mask);
709
710             /* Calculate temporary vectorial force */
711             tx               = _mm256_mul_ps(fscal,dx31);
712             ty               = _mm256_mul_ps(fscal,dy31);
713             tz               = _mm256_mul_ps(fscal,dz31);
714
715             /* Update vectorial force */
716             fix3             = _mm256_add_ps(fix3,tx);
717             fiy3             = _mm256_add_ps(fiy3,ty);
718             fiz3             = _mm256_add_ps(fiz3,tz);
719
720             fjx1             = _mm256_add_ps(fjx1,tx);
721             fjy1             = _mm256_add_ps(fjy1,ty);
722             fjz1             = _mm256_add_ps(fjz1,tz);
723
724             }
725
726             /**************************
727              * CALCULATE INTERACTIONS *
728              **************************/
729
730             if (gmx_mm256_any_lt(rsq32,rcutoff2))
731             {
732
733             r32              = _mm256_mul_ps(rsq32,rinv32);
734
735             /* EWALD ELECTROSTATICS */
736             
737             /* Analytical PME correction */
738             zeta2            = _mm256_mul_ps(beta2,rsq32);
739             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
740             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
741             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
742             felec            = _mm256_mul_ps(qq32,felec);
743             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
744             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
745             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv32,sh_ewald),pmecorrV);
746             velec            = _mm256_mul_ps(qq32,velec);
747             
748             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
749
750             /* Update potential sum for this i atom from the interaction with this j atom. */
751             velec            = _mm256_and_ps(velec,cutoff_mask);
752             velecsum         = _mm256_add_ps(velecsum,velec);
753
754             fscal            = felec;
755
756             fscal            = _mm256_and_ps(fscal,cutoff_mask);
757
758             /* Calculate temporary vectorial force */
759             tx               = _mm256_mul_ps(fscal,dx32);
760             ty               = _mm256_mul_ps(fscal,dy32);
761             tz               = _mm256_mul_ps(fscal,dz32);
762
763             /* Update vectorial force */
764             fix3             = _mm256_add_ps(fix3,tx);
765             fiy3             = _mm256_add_ps(fiy3,ty);
766             fiz3             = _mm256_add_ps(fiz3,tz);
767
768             fjx2             = _mm256_add_ps(fjx2,tx);
769             fjy2             = _mm256_add_ps(fjy2,ty);
770             fjz2             = _mm256_add_ps(fjz2,tz);
771
772             }
773
774             /**************************
775              * CALCULATE INTERACTIONS *
776              **************************/
777
778             if (gmx_mm256_any_lt(rsq33,rcutoff2))
779             {
780
781             r33              = _mm256_mul_ps(rsq33,rinv33);
782
783             /* EWALD ELECTROSTATICS */
784             
785             /* Analytical PME correction */
786             zeta2            = _mm256_mul_ps(beta2,rsq33);
787             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
788             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
789             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
790             felec            = _mm256_mul_ps(qq33,felec);
791             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
792             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
793             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv33,sh_ewald),pmecorrV);
794             velec            = _mm256_mul_ps(qq33,velec);
795             
796             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
797
798             /* Update potential sum for this i atom from the interaction with this j atom. */
799             velec            = _mm256_and_ps(velec,cutoff_mask);
800             velecsum         = _mm256_add_ps(velecsum,velec);
801
802             fscal            = felec;
803
804             fscal            = _mm256_and_ps(fscal,cutoff_mask);
805
806             /* Calculate temporary vectorial force */
807             tx               = _mm256_mul_ps(fscal,dx33);
808             ty               = _mm256_mul_ps(fscal,dy33);
809             tz               = _mm256_mul_ps(fscal,dz33);
810
811             /* Update vectorial force */
812             fix3             = _mm256_add_ps(fix3,tx);
813             fiy3             = _mm256_add_ps(fiy3,ty);
814             fiz3             = _mm256_add_ps(fiz3,tz);
815
816             fjx3             = _mm256_add_ps(fjx3,tx);
817             fjy3             = _mm256_add_ps(fjy3,ty);
818             fjz3             = _mm256_add_ps(fjz3,tz);
819
820             }
821
822             fjptrA             = f+j_coord_offsetA;
823             fjptrB             = f+j_coord_offsetB;
824             fjptrC             = f+j_coord_offsetC;
825             fjptrD             = f+j_coord_offsetD;
826             fjptrE             = f+j_coord_offsetE;
827             fjptrF             = f+j_coord_offsetF;
828             fjptrG             = f+j_coord_offsetG;
829             fjptrH             = f+j_coord_offsetH;
830
831             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
832                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
833                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
834
835             /* Inner loop uses 1025 flops */
836         }
837
838         if(jidx<j_index_end)
839         {
840
841             /* Get j neighbor index, and coordinate index */
842             jnrlistA         = jjnr[jidx];
843             jnrlistB         = jjnr[jidx+1];
844             jnrlistC         = jjnr[jidx+2];
845             jnrlistD         = jjnr[jidx+3];
846             jnrlistE         = jjnr[jidx+4];
847             jnrlistF         = jjnr[jidx+5];
848             jnrlistG         = jjnr[jidx+6];
849             jnrlistH         = jjnr[jidx+7];
850             /* Sign of each element will be negative for non-real atoms.
851              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
852              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
853              */
854             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
855                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
856                                             
857             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
858             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
859             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
860             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
861             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
862             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
863             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
864             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
865             j_coord_offsetA  = DIM*jnrA;
866             j_coord_offsetB  = DIM*jnrB;
867             j_coord_offsetC  = DIM*jnrC;
868             j_coord_offsetD  = DIM*jnrD;
869             j_coord_offsetE  = DIM*jnrE;
870             j_coord_offsetF  = DIM*jnrF;
871             j_coord_offsetG  = DIM*jnrG;
872             j_coord_offsetH  = DIM*jnrH;
873
874             /* load j atom coordinates */
875             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
876                                                  x+j_coord_offsetC,x+j_coord_offsetD,
877                                                  x+j_coord_offsetE,x+j_coord_offsetF,
878                                                  x+j_coord_offsetG,x+j_coord_offsetH,
879                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
880                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
881
882             /* Calculate displacement vector */
883             dx00             = _mm256_sub_ps(ix0,jx0);
884             dy00             = _mm256_sub_ps(iy0,jy0);
885             dz00             = _mm256_sub_ps(iz0,jz0);
886             dx11             = _mm256_sub_ps(ix1,jx1);
887             dy11             = _mm256_sub_ps(iy1,jy1);
888             dz11             = _mm256_sub_ps(iz1,jz1);
889             dx12             = _mm256_sub_ps(ix1,jx2);
890             dy12             = _mm256_sub_ps(iy1,jy2);
891             dz12             = _mm256_sub_ps(iz1,jz2);
892             dx13             = _mm256_sub_ps(ix1,jx3);
893             dy13             = _mm256_sub_ps(iy1,jy3);
894             dz13             = _mm256_sub_ps(iz1,jz3);
895             dx21             = _mm256_sub_ps(ix2,jx1);
896             dy21             = _mm256_sub_ps(iy2,jy1);
897             dz21             = _mm256_sub_ps(iz2,jz1);
898             dx22             = _mm256_sub_ps(ix2,jx2);
899             dy22             = _mm256_sub_ps(iy2,jy2);
900             dz22             = _mm256_sub_ps(iz2,jz2);
901             dx23             = _mm256_sub_ps(ix2,jx3);
902             dy23             = _mm256_sub_ps(iy2,jy3);
903             dz23             = _mm256_sub_ps(iz2,jz3);
904             dx31             = _mm256_sub_ps(ix3,jx1);
905             dy31             = _mm256_sub_ps(iy3,jy1);
906             dz31             = _mm256_sub_ps(iz3,jz1);
907             dx32             = _mm256_sub_ps(ix3,jx2);
908             dy32             = _mm256_sub_ps(iy3,jy2);
909             dz32             = _mm256_sub_ps(iz3,jz2);
910             dx33             = _mm256_sub_ps(ix3,jx3);
911             dy33             = _mm256_sub_ps(iy3,jy3);
912             dz33             = _mm256_sub_ps(iz3,jz3);
913
914             /* Calculate squared distance and things based on it */
915             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
916             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
917             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
918             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
919             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
920             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
921             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
922             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
923             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
924             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
925
926             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
927             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
928             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
929             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
930             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
931             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
932             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
933             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
934             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
935
936             rinvsq00         = gmx_mm256_inv_ps(rsq00);
937             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
938             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
939             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
940             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
941             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
942             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
943             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
944             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
945             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
946
947             fjx0             = _mm256_setzero_ps();
948             fjy0             = _mm256_setzero_ps();
949             fjz0             = _mm256_setzero_ps();
950             fjx1             = _mm256_setzero_ps();
951             fjy1             = _mm256_setzero_ps();
952             fjz1             = _mm256_setzero_ps();
953             fjx2             = _mm256_setzero_ps();
954             fjy2             = _mm256_setzero_ps();
955             fjz2             = _mm256_setzero_ps();
956             fjx3             = _mm256_setzero_ps();
957             fjy3             = _mm256_setzero_ps();
958             fjz3             = _mm256_setzero_ps();
959
960             /**************************
961              * CALCULATE INTERACTIONS *
962              **************************/
963
964             if (gmx_mm256_any_lt(rsq00,rcutoff2))
965             {
966
967             /* LENNARD-JONES DISPERSION/REPULSION */
968
969             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
970             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
971             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
972             vvdw             = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
973                                           _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
974             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
975
976             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
977
978             /* Update potential sum for this i atom from the interaction with this j atom. */
979             vvdw             = _mm256_and_ps(vvdw,cutoff_mask);
980             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
981             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
982
983             fscal            = fvdw;
984
985             fscal            = _mm256_and_ps(fscal,cutoff_mask);
986
987             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
988
989             /* Calculate temporary vectorial force */
990             tx               = _mm256_mul_ps(fscal,dx00);
991             ty               = _mm256_mul_ps(fscal,dy00);
992             tz               = _mm256_mul_ps(fscal,dz00);
993
994             /* Update vectorial force */
995             fix0             = _mm256_add_ps(fix0,tx);
996             fiy0             = _mm256_add_ps(fiy0,ty);
997             fiz0             = _mm256_add_ps(fiz0,tz);
998
999             fjx0             = _mm256_add_ps(fjx0,tx);
1000             fjy0             = _mm256_add_ps(fjy0,ty);
1001             fjz0             = _mm256_add_ps(fjz0,tz);
1002
1003             }
1004
1005             /**************************
1006              * CALCULATE INTERACTIONS *
1007              **************************/
1008
1009             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1010             {
1011
1012             r11              = _mm256_mul_ps(rsq11,rinv11);
1013             r11              = _mm256_andnot_ps(dummy_mask,r11);
1014
1015             /* EWALD ELECTROSTATICS */
1016             
1017             /* Analytical PME correction */
1018             zeta2            = _mm256_mul_ps(beta2,rsq11);
1019             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1020             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1021             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1022             felec            = _mm256_mul_ps(qq11,felec);
1023             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1024             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1025             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv11,sh_ewald),pmecorrV);
1026             velec            = _mm256_mul_ps(qq11,velec);
1027             
1028             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1029
1030             /* Update potential sum for this i atom from the interaction with this j atom. */
1031             velec            = _mm256_and_ps(velec,cutoff_mask);
1032             velec            = _mm256_andnot_ps(dummy_mask,velec);
1033             velecsum         = _mm256_add_ps(velecsum,velec);
1034
1035             fscal            = felec;
1036
1037             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1038
1039             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1040
1041             /* Calculate temporary vectorial force */
1042             tx               = _mm256_mul_ps(fscal,dx11);
1043             ty               = _mm256_mul_ps(fscal,dy11);
1044             tz               = _mm256_mul_ps(fscal,dz11);
1045
1046             /* Update vectorial force */
1047             fix1             = _mm256_add_ps(fix1,tx);
1048             fiy1             = _mm256_add_ps(fiy1,ty);
1049             fiz1             = _mm256_add_ps(fiz1,tz);
1050
1051             fjx1             = _mm256_add_ps(fjx1,tx);
1052             fjy1             = _mm256_add_ps(fjy1,ty);
1053             fjz1             = _mm256_add_ps(fjz1,tz);
1054
1055             }
1056
1057             /**************************
1058              * CALCULATE INTERACTIONS *
1059              **************************/
1060
1061             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1062             {
1063
1064             r12              = _mm256_mul_ps(rsq12,rinv12);
1065             r12              = _mm256_andnot_ps(dummy_mask,r12);
1066
1067             /* EWALD ELECTROSTATICS */
1068             
1069             /* Analytical PME correction */
1070             zeta2            = _mm256_mul_ps(beta2,rsq12);
1071             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1072             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1073             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1074             felec            = _mm256_mul_ps(qq12,felec);
1075             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1076             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1077             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv12,sh_ewald),pmecorrV);
1078             velec            = _mm256_mul_ps(qq12,velec);
1079             
1080             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1081
1082             /* Update potential sum for this i atom from the interaction with this j atom. */
1083             velec            = _mm256_and_ps(velec,cutoff_mask);
1084             velec            = _mm256_andnot_ps(dummy_mask,velec);
1085             velecsum         = _mm256_add_ps(velecsum,velec);
1086
1087             fscal            = felec;
1088
1089             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1090
1091             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1092
1093             /* Calculate temporary vectorial force */
1094             tx               = _mm256_mul_ps(fscal,dx12);
1095             ty               = _mm256_mul_ps(fscal,dy12);
1096             tz               = _mm256_mul_ps(fscal,dz12);
1097
1098             /* Update vectorial force */
1099             fix1             = _mm256_add_ps(fix1,tx);
1100             fiy1             = _mm256_add_ps(fiy1,ty);
1101             fiz1             = _mm256_add_ps(fiz1,tz);
1102
1103             fjx2             = _mm256_add_ps(fjx2,tx);
1104             fjy2             = _mm256_add_ps(fjy2,ty);
1105             fjz2             = _mm256_add_ps(fjz2,tz);
1106
1107             }
1108
1109             /**************************
1110              * CALCULATE INTERACTIONS *
1111              **************************/
1112
1113             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1114             {
1115
1116             r13              = _mm256_mul_ps(rsq13,rinv13);
1117             r13              = _mm256_andnot_ps(dummy_mask,r13);
1118
1119             /* EWALD ELECTROSTATICS */
1120             
1121             /* Analytical PME correction */
1122             zeta2            = _mm256_mul_ps(beta2,rsq13);
1123             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
1124             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1125             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1126             felec            = _mm256_mul_ps(qq13,felec);
1127             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1128             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1129             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv13,sh_ewald),pmecorrV);
1130             velec            = _mm256_mul_ps(qq13,velec);
1131             
1132             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1133
1134             /* Update potential sum for this i atom from the interaction with this j atom. */
1135             velec            = _mm256_and_ps(velec,cutoff_mask);
1136             velec            = _mm256_andnot_ps(dummy_mask,velec);
1137             velecsum         = _mm256_add_ps(velecsum,velec);
1138
1139             fscal            = felec;
1140
1141             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1142
1143             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1144
1145             /* Calculate temporary vectorial force */
1146             tx               = _mm256_mul_ps(fscal,dx13);
1147             ty               = _mm256_mul_ps(fscal,dy13);
1148             tz               = _mm256_mul_ps(fscal,dz13);
1149
1150             /* Update vectorial force */
1151             fix1             = _mm256_add_ps(fix1,tx);
1152             fiy1             = _mm256_add_ps(fiy1,ty);
1153             fiz1             = _mm256_add_ps(fiz1,tz);
1154
1155             fjx3             = _mm256_add_ps(fjx3,tx);
1156             fjy3             = _mm256_add_ps(fjy3,ty);
1157             fjz3             = _mm256_add_ps(fjz3,tz);
1158
1159             }
1160
1161             /**************************
1162              * CALCULATE INTERACTIONS *
1163              **************************/
1164
1165             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1166             {
1167
1168             r21              = _mm256_mul_ps(rsq21,rinv21);
1169             r21              = _mm256_andnot_ps(dummy_mask,r21);
1170
1171             /* EWALD ELECTROSTATICS */
1172             
1173             /* Analytical PME correction */
1174             zeta2            = _mm256_mul_ps(beta2,rsq21);
1175             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1176             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1177             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1178             felec            = _mm256_mul_ps(qq21,felec);
1179             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1180             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1181             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv21,sh_ewald),pmecorrV);
1182             velec            = _mm256_mul_ps(qq21,velec);
1183             
1184             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1185
1186             /* Update potential sum for this i atom from the interaction with this j atom. */
1187             velec            = _mm256_and_ps(velec,cutoff_mask);
1188             velec            = _mm256_andnot_ps(dummy_mask,velec);
1189             velecsum         = _mm256_add_ps(velecsum,velec);
1190
1191             fscal            = felec;
1192
1193             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1194
1195             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1196
1197             /* Calculate temporary vectorial force */
1198             tx               = _mm256_mul_ps(fscal,dx21);
1199             ty               = _mm256_mul_ps(fscal,dy21);
1200             tz               = _mm256_mul_ps(fscal,dz21);
1201
1202             /* Update vectorial force */
1203             fix2             = _mm256_add_ps(fix2,tx);
1204             fiy2             = _mm256_add_ps(fiy2,ty);
1205             fiz2             = _mm256_add_ps(fiz2,tz);
1206
1207             fjx1             = _mm256_add_ps(fjx1,tx);
1208             fjy1             = _mm256_add_ps(fjy1,ty);
1209             fjz1             = _mm256_add_ps(fjz1,tz);
1210
1211             }
1212
1213             /**************************
1214              * CALCULATE INTERACTIONS *
1215              **************************/
1216
1217             if (gmx_mm256_any_lt(rsq22,rcutoff2))
1218             {
1219
1220             r22              = _mm256_mul_ps(rsq22,rinv22);
1221             r22              = _mm256_andnot_ps(dummy_mask,r22);
1222
1223             /* EWALD ELECTROSTATICS */
1224             
1225             /* Analytical PME correction */
1226             zeta2            = _mm256_mul_ps(beta2,rsq22);
1227             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1228             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1229             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1230             felec            = _mm256_mul_ps(qq22,felec);
1231             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1232             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1233             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv22,sh_ewald),pmecorrV);
1234             velec            = _mm256_mul_ps(qq22,velec);
1235             
1236             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1237
1238             /* Update potential sum for this i atom from the interaction with this j atom. */
1239             velec            = _mm256_and_ps(velec,cutoff_mask);
1240             velec            = _mm256_andnot_ps(dummy_mask,velec);
1241             velecsum         = _mm256_add_ps(velecsum,velec);
1242
1243             fscal            = felec;
1244
1245             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1246
1247             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1248
1249             /* Calculate temporary vectorial force */
1250             tx               = _mm256_mul_ps(fscal,dx22);
1251             ty               = _mm256_mul_ps(fscal,dy22);
1252             tz               = _mm256_mul_ps(fscal,dz22);
1253
1254             /* Update vectorial force */
1255             fix2             = _mm256_add_ps(fix2,tx);
1256             fiy2             = _mm256_add_ps(fiy2,ty);
1257             fiz2             = _mm256_add_ps(fiz2,tz);
1258
1259             fjx2             = _mm256_add_ps(fjx2,tx);
1260             fjy2             = _mm256_add_ps(fjy2,ty);
1261             fjz2             = _mm256_add_ps(fjz2,tz);
1262
1263             }
1264
1265             /**************************
1266              * CALCULATE INTERACTIONS *
1267              **************************/
1268
1269             if (gmx_mm256_any_lt(rsq23,rcutoff2))
1270             {
1271
1272             r23              = _mm256_mul_ps(rsq23,rinv23);
1273             r23              = _mm256_andnot_ps(dummy_mask,r23);
1274
1275             /* EWALD ELECTROSTATICS */
1276             
1277             /* Analytical PME correction */
1278             zeta2            = _mm256_mul_ps(beta2,rsq23);
1279             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
1280             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1281             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1282             felec            = _mm256_mul_ps(qq23,felec);
1283             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1284             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1285             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv23,sh_ewald),pmecorrV);
1286             velec            = _mm256_mul_ps(qq23,velec);
1287             
1288             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
1289
1290             /* Update potential sum for this i atom from the interaction with this j atom. */
1291             velec            = _mm256_and_ps(velec,cutoff_mask);
1292             velec            = _mm256_andnot_ps(dummy_mask,velec);
1293             velecsum         = _mm256_add_ps(velecsum,velec);
1294
1295             fscal            = felec;
1296
1297             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1298
1299             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1300
1301             /* Calculate temporary vectorial force */
1302             tx               = _mm256_mul_ps(fscal,dx23);
1303             ty               = _mm256_mul_ps(fscal,dy23);
1304             tz               = _mm256_mul_ps(fscal,dz23);
1305
1306             /* Update vectorial force */
1307             fix2             = _mm256_add_ps(fix2,tx);
1308             fiy2             = _mm256_add_ps(fiy2,ty);
1309             fiz2             = _mm256_add_ps(fiz2,tz);
1310
1311             fjx3             = _mm256_add_ps(fjx3,tx);
1312             fjy3             = _mm256_add_ps(fjy3,ty);
1313             fjz3             = _mm256_add_ps(fjz3,tz);
1314
1315             }
1316
1317             /**************************
1318              * CALCULATE INTERACTIONS *
1319              **************************/
1320
1321             if (gmx_mm256_any_lt(rsq31,rcutoff2))
1322             {
1323
1324             r31              = _mm256_mul_ps(rsq31,rinv31);
1325             r31              = _mm256_andnot_ps(dummy_mask,r31);
1326
1327             /* EWALD ELECTROSTATICS */
1328             
1329             /* Analytical PME correction */
1330             zeta2            = _mm256_mul_ps(beta2,rsq31);
1331             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
1332             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1333             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1334             felec            = _mm256_mul_ps(qq31,felec);
1335             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1336             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1337             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv31,sh_ewald),pmecorrV);
1338             velec            = _mm256_mul_ps(qq31,velec);
1339             
1340             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
1341
1342             /* Update potential sum for this i atom from the interaction with this j atom. */
1343             velec            = _mm256_and_ps(velec,cutoff_mask);
1344             velec            = _mm256_andnot_ps(dummy_mask,velec);
1345             velecsum         = _mm256_add_ps(velecsum,velec);
1346
1347             fscal            = felec;
1348
1349             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1350
1351             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1352
1353             /* Calculate temporary vectorial force */
1354             tx               = _mm256_mul_ps(fscal,dx31);
1355             ty               = _mm256_mul_ps(fscal,dy31);
1356             tz               = _mm256_mul_ps(fscal,dz31);
1357
1358             /* Update vectorial force */
1359             fix3             = _mm256_add_ps(fix3,tx);
1360             fiy3             = _mm256_add_ps(fiy3,ty);
1361             fiz3             = _mm256_add_ps(fiz3,tz);
1362
1363             fjx1             = _mm256_add_ps(fjx1,tx);
1364             fjy1             = _mm256_add_ps(fjy1,ty);
1365             fjz1             = _mm256_add_ps(fjz1,tz);
1366
1367             }
1368
1369             /**************************
1370              * CALCULATE INTERACTIONS *
1371              **************************/
1372
1373             if (gmx_mm256_any_lt(rsq32,rcutoff2))
1374             {
1375
1376             r32              = _mm256_mul_ps(rsq32,rinv32);
1377             r32              = _mm256_andnot_ps(dummy_mask,r32);
1378
1379             /* EWALD ELECTROSTATICS */
1380             
1381             /* Analytical PME correction */
1382             zeta2            = _mm256_mul_ps(beta2,rsq32);
1383             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
1384             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1385             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1386             felec            = _mm256_mul_ps(qq32,felec);
1387             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1388             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1389             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv32,sh_ewald),pmecorrV);
1390             velec            = _mm256_mul_ps(qq32,velec);
1391             
1392             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
1393
1394             /* Update potential sum for this i atom from the interaction with this j atom. */
1395             velec            = _mm256_and_ps(velec,cutoff_mask);
1396             velec            = _mm256_andnot_ps(dummy_mask,velec);
1397             velecsum         = _mm256_add_ps(velecsum,velec);
1398
1399             fscal            = felec;
1400
1401             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1402
1403             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1404
1405             /* Calculate temporary vectorial force */
1406             tx               = _mm256_mul_ps(fscal,dx32);
1407             ty               = _mm256_mul_ps(fscal,dy32);
1408             tz               = _mm256_mul_ps(fscal,dz32);
1409
1410             /* Update vectorial force */
1411             fix3             = _mm256_add_ps(fix3,tx);
1412             fiy3             = _mm256_add_ps(fiy3,ty);
1413             fiz3             = _mm256_add_ps(fiz3,tz);
1414
1415             fjx2             = _mm256_add_ps(fjx2,tx);
1416             fjy2             = _mm256_add_ps(fjy2,ty);
1417             fjz2             = _mm256_add_ps(fjz2,tz);
1418
1419             }
1420
1421             /**************************
1422              * CALCULATE INTERACTIONS *
1423              **************************/
1424
1425             if (gmx_mm256_any_lt(rsq33,rcutoff2))
1426             {
1427
1428             r33              = _mm256_mul_ps(rsq33,rinv33);
1429             r33              = _mm256_andnot_ps(dummy_mask,r33);
1430
1431             /* EWALD ELECTROSTATICS */
1432             
1433             /* Analytical PME correction */
1434             zeta2            = _mm256_mul_ps(beta2,rsq33);
1435             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
1436             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1437             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1438             felec            = _mm256_mul_ps(qq33,felec);
1439             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
1440             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1441             velec            = _mm256_sub_ps(_mm256_sub_ps(rinv33,sh_ewald),pmecorrV);
1442             velec            = _mm256_mul_ps(qq33,velec);
1443             
1444             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
1445
1446             /* Update potential sum for this i atom from the interaction with this j atom. */
1447             velec            = _mm256_and_ps(velec,cutoff_mask);
1448             velec            = _mm256_andnot_ps(dummy_mask,velec);
1449             velecsum         = _mm256_add_ps(velecsum,velec);
1450
1451             fscal            = felec;
1452
1453             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1454
1455             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1456
1457             /* Calculate temporary vectorial force */
1458             tx               = _mm256_mul_ps(fscal,dx33);
1459             ty               = _mm256_mul_ps(fscal,dy33);
1460             tz               = _mm256_mul_ps(fscal,dz33);
1461
1462             /* Update vectorial force */
1463             fix3             = _mm256_add_ps(fix3,tx);
1464             fiy3             = _mm256_add_ps(fiy3,ty);
1465             fiz3             = _mm256_add_ps(fiz3,tz);
1466
1467             fjx3             = _mm256_add_ps(fjx3,tx);
1468             fjy3             = _mm256_add_ps(fjy3,ty);
1469             fjz3             = _mm256_add_ps(fjz3,tz);
1470
1471             }
1472
1473             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1474             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1475             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1476             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1477             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1478             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1479             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1480             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1481
1482             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1483                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1484                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1485
1486             /* Inner loop uses 1034 flops */
1487         }
1488
1489         /* End of innermost loop */
1490
1491         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1492                                                  f+i_coord_offset,fshift+i_shift_offset);
1493
1494         ggid                        = gid[iidx];
1495         /* Update potential energies */
1496         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1497         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1498
1499         /* Increment number of inner iterations */
1500         inneriter                  += j_index_end - j_index_start;
1501
1502         /* Outer loop uses 26 flops */
1503     }
1504
1505     /* Increment number of outer iterations */
1506     outeriter        += nri;
1507
1508     /* Update outer/inner flops */
1509
1510     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*1034);
1511 }
1512 /*
1513  * Gromacs nonbonded kernel:   nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_single
1514  * Electrostatics interaction: Ewald
1515  * VdW interaction:            LennardJones
1516  * Geometry:                   Water4-Water4
1517  * Calculate force/pot:        Force
1518  */
1519 void
1520 nb_kernel_ElecEwSh_VdwLJSh_GeomW4W4_F_avx_256_single
1521                     (t_nblist                    * gmx_restrict       nlist,
1522                      rvec                        * gmx_restrict          xx,
1523                      rvec                        * gmx_restrict          ff,
1524                      t_forcerec                  * gmx_restrict          fr,
1525                      t_mdatoms                   * gmx_restrict     mdatoms,
1526                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1527                      t_nrnb                      * gmx_restrict        nrnb)
1528 {
1529     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1530      * just 0 for non-waters.
1531      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1532      * jnr indices corresponding to data put in the four positions in the SIMD register.
1533      */
1534     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1535     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1536     int              jnrA,jnrB,jnrC,jnrD;
1537     int              jnrE,jnrF,jnrG,jnrH;
1538     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1539     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1540     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1541     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1542     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1543     real             rcutoff_scalar;
1544     real             *shiftvec,*fshift,*x,*f;
1545     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1546     real             scratch[4*DIM];
1547     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1548     real *           vdwioffsetptr0;
1549     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1550     real *           vdwioffsetptr1;
1551     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1552     real *           vdwioffsetptr2;
1553     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1554     real *           vdwioffsetptr3;
1555     __m256           ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1556     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1557     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1558     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1559     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1560     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1561     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1562     int              vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1563     __m256           jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1564     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1565     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1566     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1567     __m256           dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1568     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1569     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1570     __m256           dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1571     __m256           dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1572     __m256           dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1573     __m256           dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1574     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1575     real             *charge;
1576     int              nvdwtype;
1577     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1578     int              *vdwtype;
1579     real             *vdwparam;
1580     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1581     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1582     __m256i          ewitab;
1583     __m128i          ewitab_lo,ewitab_hi;
1584     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1585     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1586     real             *ewtab;
1587     __m256           dummy_mask,cutoff_mask;
1588     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1589     __m256           one     = _mm256_set1_ps(1.0);
1590     __m256           two     = _mm256_set1_ps(2.0);
1591     x                = xx[0];
1592     f                = ff[0];
1593
1594     nri              = nlist->nri;
1595     iinr             = nlist->iinr;
1596     jindex           = nlist->jindex;
1597     jjnr             = nlist->jjnr;
1598     shiftidx         = nlist->shift;
1599     gid              = nlist->gid;
1600     shiftvec         = fr->shift_vec[0];
1601     fshift           = fr->fshift[0];
1602     facel            = _mm256_set1_ps(fr->epsfac);
1603     charge           = mdatoms->chargeA;
1604     nvdwtype         = fr->ntype;
1605     vdwparam         = fr->nbfp;
1606     vdwtype          = mdatoms->typeA;
1607
1608     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1609     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1610     beta2            = _mm256_mul_ps(beta,beta);
1611     beta3            = _mm256_mul_ps(beta,beta2);
1612
1613     ewtab            = fr->ic->tabq_coul_F;
1614     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1615     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1616
1617     /* Setup water-specific parameters */
1618     inr              = nlist->iinr[0];
1619     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1620     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1621     iq3              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1622     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1623
1624     jq1              = _mm256_set1_ps(charge[inr+1]);
1625     jq2              = _mm256_set1_ps(charge[inr+2]);
1626     jq3              = _mm256_set1_ps(charge[inr+3]);
1627     vdwjidx0A        = 2*vdwtype[inr+0];
1628     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1629     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1630     qq11             = _mm256_mul_ps(iq1,jq1);
1631     qq12             = _mm256_mul_ps(iq1,jq2);
1632     qq13             = _mm256_mul_ps(iq1,jq3);
1633     qq21             = _mm256_mul_ps(iq2,jq1);
1634     qq22             = _mm256_mul_ps(iq2,jq2);
1635     qq23             = _mm256_mul_ps(iq2,jq3);
1636     qq31             = _mm256_mul_ps(iq3,jq1);
1637     qq32             = _mm256_mul_ps(iq3,jq2);
1638     qq33             = _mm256_mul_ps(iq3,jq3);
1639
1640     /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1641     rcutoff_scalar   = fr->rcoulomb;
1642     rcutoff          = _mm256_set1_ps(rcutoff_scalar);
1643     rcutoff2         = _mm256_mul_ps(rcutoff,rcutoff);
1644
1645     sh_vdw_invrcut6  = _mm256_set1_ps(fr->ic->sh_invrc6);
1646     rvdw             = _mm256_set1_ps(fr->rvdw);
1647
1648     /* Avoid stupid compiler warnings */
1649     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1650     j_coord_offsetA = 0;
1651     j_coord_offsetB = 0;
1652     j_coord_offsetC = 0;
1653     j_coord_offsetD = 0;
1654     j_coord_offsetE = 0;
1655     j_coord_offsetF = 0;
1656     j_coord_offsetG = 0;
1657     j_coord_offsetH = 0;
1658
1659     outeriter        = 0;
1660     inneriter        = 0;
1661
1662     for(iidx=0;iidx<4*DIM;iidx++)
1663     {
1664         scratch[iidx] = 0.0;
1665     }
1666
1667     /* Start outer loop over neighborlists */
1668     for(iidx=0; iidx<nri; iidx++)
1669     {
1670         /* Load shift vector for this list */
1671         i_shift_offset   = DIM*shiftidx[iidx];
1672
1673         /* Load limits for loop over neighbors */
1674         j_index_start    = jindex[iidx];
1675         j_index_end      = jindex[iidx+1];
1676
1677         /* Get outer coordinate index */
1678         inr              = iinr[iidx];
1679         i_coord_offset   = DIM*inr;
1680
1681         /* Load i particle coords and add shift vector */
1682         gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1683                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1684
1685         fix0             = _mm256_setzero_ps();
1686         fiy0             = _mm256_setzero_ps();
1687         fiz0             = _mm256_setzero_ps();
1688         fix1             = _mm256_setzero_ps();
1689         fiy1             = _mm256_setzero_ps();
1690         fiz1             = _mm256_setzero_ps();
1691         fix2             = _mm256_setzero_ps();
1692         fiy2             = _mm256_setzero_ps();
1693         fiz2             = _mm256_setzero_ps();
1694         fix3             = _mm256_setzero_ps();
1695         fiy3             = _mm256_setzero_ps();
1696         fiz3             = _mm256_setzero_ps();
1697
1698         /* Start inner kernel loop */
1699         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1700         {
1701
1702             /* Get j neighbor index, and coordinate index */
1703             jnrA             = jjnr[jidx];
1704             jnrB             = jjnr[jidx+1];
1705             jnrC             = jjnr[jidx+2];
1706             jnrD             = jjnr[jidx+3];
1707             jnrE             = jjnr[jidx+4];
1708             jnrF             = jjnr[jidx+5];
1709             jnrG             = jjnr[jidx+6];
1710             jnrH             = jjnr[jidx+7];
1711             j_coord_offsetA  = DIM*jnrA;
1712             j_coord_offsetB  = DIM*jnrB;
1713             j_coord_offsetC  = DIM*jnrC;
1714             j_coord_offsetD  = DIM*jnrD;
1715             j_coord_offsetE  = DIM*jnrE;
1716             j_coord_offsetF  = DIM*jnrF;
1717             j_coord_offsetG  = DIM*jnrG;
1718             j_coord_offsetH  = DIM*jnrH;
1719
1720             /* load j atom coordinates */
1721             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1722                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1723                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1724                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1725                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1726                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
1727
1728             /* Calculate displacement vector */
1729             dx00             = _mm256_sub_ps(ix0,jx0);
1730             dy00             = _mm256_sub_ps(iy0,jy0);
1731             dz00             = _mm256_sub_ps(iz0,jz0);
1732             dx11             = _mm256_sub_ps(ix1,jx1);
1733             dy11             = _mm256_sub_ps(iy1,jy1);
1734             dz11             = _mm256_sub_ps(iz1,jz1);
1735             dx12             = _mm256_sub_ps(ix1,jx2);
1736             dy12             = _mm256_sub_ps(iy1,jy2);
1737             dz12             = _mm256_sub_ps(iz1,jz2);
1738             dx13             = _mm256_sub_ps(ix1,jx3);
1739             dy13             = _mm256_sub_ps(iy1,jy3);
1740             dz13             = _mm256_sub_ps(iz1,jz3);
1741             dx21             = _mm256_sub_ps(ix2,jx1);
1742             dy21             = _mm256_sub_ps(iy2,jy1);
1743             dz21             = _mm256_sub_ps(iz2,jz1);
1744             dx22             = _mm256_sub_ps(ix2,jx2);
1745             dy22             = _mm256_sub_ps(iy2,jy2);
1746             dz22             = _mm256_sub_ps(iz2,jz2);
1747             dx23             = _mm256_sub_ps(ix2,jx3);
1748             dy23             = _mm256_sub_ps(iy2,jy3);
1749             dz23             = _mm256_sub_ps(iz2,jz3);
1750             dx31             = _mm256_sub_ps(ix3,jx1);
1751             dy31             = _mm256_sub_ps(iy3,jy1);
1752             dz31             = _mm256_sub_ps(iz3,jz1);
1753             dx32             = _mm256_sub_ps(ix3,jx2);
1754             dy32             = _mm256_sub_ps(iy3,jy2);
1755             dz32             = _mm256_sub_ps(iz3,jz2);
1756             dx33             = _mm256_sub_ps(ix3,jx3);
1757             dy33             = _mm256_sub_ps(iy3,jy3);
1758             dz33             = _mm256_sub_ps(iz3,jz3);
1759
1760             /* Calculate squared distance and things based on it */
1761             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1762             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1763             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1764             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1765             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1766             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1767             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1768             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1769             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1770             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1771
1772             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1773             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1774             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
1775             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1776             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1777             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
1778             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
1779             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
1780             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
1781
1782             rinvsq00         = gmx_mm256_inv_ps(rsq00);
1783             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1784             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1785             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
1786             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1787             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1788             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
1789             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
1790             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
1791             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
1792
1793             fjx0             = _mm256_setzero_ps();
1794             fjy0             = _mm256_setzero_ps();
1795             fjz0             = _mm256_setzero_ps();
1796             fjx1             = _mm256_setzero_ps();
1797             fjy1             = _mm256_setzero_ps();
1798             fjz1             = _mm256_setzero_ps();
1799             fjx2             = _mm256_setzero_ps();
1800             fjy2             = _mm256_setzero_ps();
1801             fjz2             = _mm256_setzero_ps();
1802             fjx3             = _mm256_setzero_ps();
1803             fjy3             = _mm256_setzero_ps();
1804             fjz3             = _mm256_setzero_ps();
1805
1806             /**************************
1807              * CALCULATE INTERACTIONS *
1808              **************************/
1809
1810             if (gmx_mm256_any_lt(rsq00,rcutoff2))
1811             {
1812
1813             /* LENNARD-JONES DISPERSION/REPULSION */
1814
1815             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1816             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1817
1818             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1819
1820             fscal            = fvdw;
1821
1822             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1823
1824             /* Calculate temporary vectorial force */
1825             tx               = _mm256_mul_ps(fscal,dx00);
1826             ty               = _mm256_mul_ps(fscal,dy00);
1827             tz               = _mm256_mul_ps(fscal,dz00);
1828
1829             /* Update vectorial force */
1830             fix0             = _mm256_add_ps(fix0,tx);
1831             fiy0             = _mm256_add_ps(fiy0,ty);
1832             fiz0             = _mm256_add_ps(fiz0,tz);
1833
1834             fjx0             = _mm256_add_ps(fjx0,tx);
1835             fjy0             = _mm256_add_ps(fjy0,ty);
1836             fjz0             = _mm256_add_ps(fjz0,tz);
1837
1838             }
1839
1840             /**************************
1841              * CALCULATE INTERACTIONS *
1842              **************************/
1843
1844             if (gmx_mm256_any_lt(rsq11,rcutoff2))
1845             {
1846
1847             r11              = _mm256_mul_ps(rsq11,rinv11);
1848
1849             /* EWALD ELECTROSTATICS */
1850             
1851             /* Analytical PME correction */
1852             zeta2            = _mm256_mul_ps(beta2,rsq11);
1853             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1854             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1855             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1856             felec            = _mm256_mul_ps(qq11,felec);
1857             
1858             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1859
1860             fscal            = felec;
1861
1862             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1863
1864             /* Calculate temporary vectorial force */
1865             tx               = _mm256_mul_ps(fscal,dx11);
1866             ty               = _mm256_mul_ps(fscal,dy11);
1867             tz               = _mm256_mul_ps(fscal,dz11);
1868
1869             /* Update vectorial force */
1870             fix1             = _mm256_add_ps(fix1,tx);
1871             fiy1             = _mm256_add_ps(fiy1,ty);
1872             fiz1             = _mm256_add_ps(fiz1,tz);
1873
1874             fjx1             = _mm256_add_ps(fjx1,tx);
1875             fjy1             = _mm256_add_ps(fjy1,ty);
1876             fjz1             = _mm256_add_ps(fjz1,tz);
1877
1878             }
1879
1880             /**************************
1881              * CALCULATE INTERACTIONS *
1882              **************************/
1883
1884             if (gmx_mm256_any_lt(rsq12,rcutoff2))
1885             {
1886
1887             r12              = _mm256_mul_ps(rsq12,rinv12);
1888
1889             /* EWALD ELECTROSTATICS */
1890             
1891             /* Analytical PME correction */
1892             zeta2            = _mm256_mul_ps(beta2,rsq12);
1893             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1894             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1895             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1896             felec            = _mm256_mul_ps(qq12,felec);
1897             
1898             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1899
1900             fscal            = felec;
1901
1902             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1903
1904             /* Calculate temporary vectorial force */
1905             tx               = _mm256_mul_ps(fscal,dx12);
1906             ty               = _mm256_mul_ps(fscal,dy12);
1907             tz               = _mm256_mul_ps(fscal,dz12);
1908
1909             /* Update vectorial force */
1910             fix1             = _mm256_add_ps(fix1,tx);
1911             fiy1             = _mm256_add_ps(fiy1,ty);
1912             fiz1             = _mm256_add_ps(fiz1,tz);
1913
1914             fjx2             = _mm256_add_ps(fjx2,tx);
1915             fjy2             = _mm256_add_ps(fjy2,ty);
1916             fjz2             = _mm256_add_ps(fjz2,tz);
1917
1918             }
1919
1920             /**************************
1921              * CALCULATE INTERACTIONS *
1922              **************************/
1923
1924             if (gmx_mm256_any_lt(rsq13,rcutoff2))
1925             {
1926
1927             r13              = _mm256_mul_ps(rsq13,rinv13);
1928
1929             /* EWALD ELECTROSTATICS */
1930             
1931             /* Analytical PME correction */
1932             zeta2            = _mm256_mul_ps(beta2,rsq13);
1933             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
1934             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1935             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1936             felec            = _mm256_mul_ps(qq13,felec);
1937             
1938             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
1939
1940             fscal            = felec;
1941
1942             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1943
1944             /* Calculate temporary vectorial force */
1945             tx               = _mm256_mul_ps(fscal,dx13);
1946             ty               = _mm256_mul_ps(fscal,dy13);
1947             tz               = _mm256_mul_ps(fscal,dz13);
1948
1949             /* Update vectorial force */
1950             fix1             = _mm256_add_ps(fix1,tx);
1951             fiy1             = _mm256_add_ps(fiy1,ty);
1952             fiz1             = _mm256_add_ps(fiz1,tz);
1953
1954             fjx3             = _mm256_add_ps(fjx3,tx);
1955             fjy3             = _mm256_add_ps(fjy3,ty);
1956             fjz3             = _mm256_add_ps(fjz3,tz);
1957
1958             }
1959
1960             /**************************
1961              * CALCULATE INTERACTIONS *
1962              **************************/
1963
1964             if (gmx_mm256_any_lt(rsq21,rcutoff2))
1965             {
1966
1967             r21              = _mm256_mul_ps(rsq21,rinv21);
1968
1969             /* EWALD ELECTROSTATICS */
1970             
1971             /* Analytical PME correction */
1972             zeta2            = _mm256_mul_ps(beta2,rsq21);
1973             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1974             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1975             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1976             felec            = _mm256_mul_ps(qq21,felec);
1977             
1978             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1979
1980             fscal            = felec;
1981
1982             fscal            = _mm256_and_ps(fscal,cutoff_mask);
1983
1984             /* Calculate temporary vectorial force */
1985             tx               = _mm256_mul_ps(fscal,dx21);
1986             ty               = _mm256_mul_ps(fscal,dy21);
1987             tz               = _mm256_mul_ps(fscal,dz21);
1988
1989             /* Update vectorial force */
1990             fix2             = _mm256_add_ps(fix2,tx);
1991             fiy2             = _mm256_add_ps(fiy2,ty);
1992             fiz2             = _mm256_add_ps(fiz2,tz);
1993
1994             fjx1             = _mm256_add_ps(fjx1,tx);
1995             fjy1             = _mm256_add_ps(fjy1,ty);
1996             fjz1             = _mm256_add_ps(fjz1,tz);
1997
1998             }
1999
2000             /**************************
2001              * CALCULATE INTERACTIONS *
2002              **************************/
2003
2004             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2005             {
2006
2007             r22              = _mm256_mul_ps(rsq22,rinv22);
2008
2009             /* EWALD ELECTROSTATICS */
2010             
2011             /* Analytical PME correction */
2012             zeta2            = _mm256_mul_ps(beta2,rsq22);
2013             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2014             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2015             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2016             felec            = _mm256_mul_ps(qq22,felec);
2017             
2018             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2019
2020             fscal            = felec;
2021
2022             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2023
2024             /* Calculate temporary vectorial force */
2025             tx               = _mm256_mul_ps(fscal,dx22);
2026             ty               = _mm256_mul_ps(fscal,dy22);
2027             tz               = _mm256_mul_ps(fscal,dz22);
2028
2029             /* Update vectorial force */
2030             fix2             = _mm256_add_ps(fix2,tx);
2031             fiy2             = _mm256_add_ps(fiy2,ty);
2032             fiz2             = _mm256_add_ps(fiz2,tz);
2033
2034             fjx2             = _mm256_add_ps(fjx2,tx);
2035             fjy2             = _mm256_add_ps(fjy2,ty);
2036             fjz2             = _mm256_add_ps(fjz2,tz);
2037
2038             }
2039
2040             /**************************
2041              * CALCULATE INTERACTIONS *
2042              **************************/
2043
2044             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2045             {
2046
2047             r23              = _mm256_mul_ps(rsq23,rinv23);
2048
2049             /* EWALD ELECTROSTATICS */
2050             
2051             /* Analytical PME correction */
2052             zeta2            = _mm256_mul_ps(beta2,rsq23);
2053             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
2054             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2055             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2056             felec            = _mm256_mul_ps(qq23,felec);
2057             
2058             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2059
2060             fscal            = felec;
2061
2062             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2063
2064             /* Calculate temporary vectorial force */
2065             tx               = _mm256_mul_ps(fscal,dx23);
2066             ty               = _mm256_mul_ps(fscal,dy23);
2067             tz               = _mm256_mul_ps(fscal,dz23);
2068
2069             /* Update vectorial force */
2070             fix2             = _mm256_add_ps(fix2,tx);
2071             fiy2             = _mm256_add_ps(fiy2,ty);
2072             fiz2             = _mm256_add_ps(fiz2,tz);
2073
2074             fjx3             = _mm256_add_ps(fjx3,tx);
2075             fjy3             = _mm256_add_ps(fjy3,ty);
2076             fjz3             = _mm256_add_ps(fjz3,tz);
2077
2078             }
2079
2080             /**************************
2081              * CALCULATE INTERACTIONS *
2082              **************************/
2083
2084             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2085             {
2086
2087             r31              = _mm256_mul_ps(rsq31,rinv31);
2088
2089             /* EWALD ELECTROSTATICS */
2090             
2091             /* Analytical PME correction */
2092             zeta2            = _mm256_mul_ps(beta2,rsq31);
2093             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
2094             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2095             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2096             felec            = _mm256_mul_ps(qq31,felec);
2097             
2098             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2099
2100             fscal            = felec;
2101
2102             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2103
2104             /* Calculate temporary vectorial force */
2105             tx               = _mm256_mul_ps(fscal,dx31);
2106             ty               = _mm256_mul_ps(fscal,dy31);
2107             tz               = _mm256_mul_ps(fscal,dz31);
2108
2109             /* Update vectorial force */
2110             fix3             = _mm256_add_ps(fix3,tx);
2111             fiy3             = _mm256_add_ps(fiy3,ty);
2112             fiz3             = _mm256_add_ps(fiz3,tz);
2113
2114             fjx1             = _mm256_add_ps(fjx1,tx);
2115             fjy1             = _mm256_add_ps(fjy1,ty);
2116             fjz1             = _mm256_add_ps(fjz1,tz);
2117
2118             }
2119
2120             /**************************
2121              * CALCULATE INTERACTIONS *
2122              **************************/
2123
2124             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2125             {
2126
2127             r32              = _mm256_mul_ps(rsq32,rinv32);
2128
2129             /* EWALD ELECTROSTATICS */
2130             
2131             /* Analytical PME correction */
2132             zeta2            = _mm256_mul_ps(beta2,rsq32);
2133             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
2134             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2135             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2136             felec            = _mm256_mul_ps(qq32,felec);
2137             
2138             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2139
2140             fscal            = felec;
2141
2142             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2143
2144             /* Calculate temporary vectorial force */
2145             tx               = _mm256_mul_ps(fscal,dx32);
2146             ty               = _mm256_mul_ps(fscal,dy32);
2147             tz               = _mm256_mul_ps(fscal,dz32);
2148
2149             /* Update vectorial force */
2150             fix3             = _mm256_add_ps(fix3,tx);
2151             fiy3             = _mm256_add_ps(fiy3,ty);
2152             fiz3             = _mm256_add_ps(fiz3,tz);
2153
2154             fjx2             = _mm256_add_ps(fjx2,tx);
2155             fjy2             = _mm256_add_ps(fjy2,ty);
2156             fjz2             = _mm256_add_ps(fjz2,tz);
2157
2158             }
2159
2160             /**************************
2161              * CALCULATE INTERACTIONS *
2162              **************************/
2163
2164             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2165             {
2166
2167             r33              = _mm256_mul_ps(rsq33,rinv33);
2168
2169             /* EWALD ELECTROSTATICS */
2170             
2171             /* Analytical PME correction */
2172             zeta2            = _mm256_mul_ps(beta2,rsq33);
2173             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
2174             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2175             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2176             felec            = _mm256_mul_ps(qq33,felec);
2177             
2178             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2179
2180             fscal            = felec;
2181
2182             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2183
2184             /* Calculate temporary vectorial force */
2185             tx               = _mm256_mul_ps(fscal,dx33);
2186             ty               = _mm256_mul_ps(fscal,dy33);
2187             tz               = _mm256_mul_ps(fscal,dz33);
2188
2189             /* Update vectorial force */
2190             fix3             = _mm256_add_ps(fix3,tx);
2191             fiy3             = _mm256_add_ps(fiy3,ty);
2192             fiz3             = _mm256_add_ps(fiz3,tz);
2193
2194             fjx3             = _mm256_add_ps(fjx3,tx);
2195             fjy3             = _mm256_add_ps(fjy3,ty);
2196             fjz3             = _mm256_add_ps(fjz3,tz);
2197
2198             }
2199
2200             fjptrA             = f+j_coord_offsetA;
2201             fjptrB             = f+j_coord_offsetB;
2202             fjptrC             = f+j_coord_offsetC;
2203             fjptrD             = f+j_coord_offsetD;
2204             fjptrE             = f+j_coord_offsetE;
2205             fjptrF             = f+j_coord_offsetF;
2206             fjptrG             = f+j_coord_offsetG;
2207             fjptrH             = f+j_coord_offsetH;
2208
2209             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2210                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2211                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2212
2213             /* Inner loop uses 564 flops */
2214         }
2215
2216         if(jidx<j_index_end)
2217         {
2218
2219             /* Get j neighbor index, and coordinate index */
2220             jnrlistA         = jjnr[jidx];
2221             jnrlistB         = jjnr[jidx+1];
2222             jnrlistC         = jjnr[jidx+2];
2223             jnrlistD         = jjnr[jidx+3];
2224             jnrlistE         = jjnr[jidx+4];
2225             jnrlistF         = jjnr[jidx+5];
2226             jnrlistG         = jjnr[jidx+6];
2227             jnrlistH         = jjnr[jidx+7];
2228             /* Sign of each element will be negative for non-real atoms.
2229              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2230              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2231              */
2232             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2233                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2234                                             
2235             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
2236             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
2237             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
2238             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
2239             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
2240             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
2241             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
2242             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
2243             j_coord_offsetA  = DIM*jnrA;
2244             j_coord_offsetB  = DIM*jnrB;
2245             j_coord_offsetC  = DIM*jnrC;
2246             j_coord_offsetD  = DIM*jnrD;
2247             j_coord_offsetE  = DIM*jnrE;
2248             j_coord_offsetF  = DIM*jnrF;
2249             j_coord_offsetG  = DIM*jnrG;
2250             j_coord_offsetH  = DIM*jnrH;
2251
2252             /* load j atom coordinates */
2253             gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2254                                                  x+j_coord_offsetC,x+j_coord_offsetD,
2255                                                  x+j_coord_offsetE,x+j_coord_offsetF,
2256                                                  x+j_coord_offsetG,x+j_coord_offsetH,
2257                                                  &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2258                                                  &jy2,&jz2,&jx3,&jy3,&jz3);
2259
2260             /* Calculate displacement vector */
2261             dx00             = _mm256_sub_ps(ix0,jx0);
2262             dy00             = _mm256_sub_ps(iy0,jy0);
2263             dz00             = _mm256_sub_ps(iz0,jz0);
2264             dx11             = _mm256_sub_ps(ix1,jx1);
2265             dy11             = _mm256_sub_ps(iy1,jy1);
2266             dz11             = _mm256_sub_ps(iz1,jz1);
2267             dx12             = _mm256_sub_ps(ix1,jx2);
2268             dy12             = _mm256_sub_ps(iy1,jy2);
2269             dz12             = _mm256_sub_ps(iz1,jz2);
2270             dx13             = _mm256_sub_ps(ix1,jx3);
2271             dy13             = _mm256_sub_ps(iy1,jy3);
2272             dz13             = _mm256_sub_ps(iz1,jz3);
2273             dx21             = _mm256_sub_ps(ix2,jx1);
2274             dy21             = _mm256_sub_ps(iy2,jy1);
2275             dz21             = _mm256_sub_ps(iz2,jz1);
2276             dx22             = _mm256_sub_ps(ix2,jx2);
2277             dy22             = _mm256_sub_ps(iy2,jy2);
2278             dz22             = _mm256_sub_ps(iz2,jz2);
2279             dx23             = _mm256_sub_ps(ix2,jx3);
2280             dy23             = _mm256_sub_ps(iy2,jy3);
2281             dz23             = _mm256_sub_ps(iz2,jz3);
2282             dx31             = _mm256_sub_ps(ix3,jx1);
2283             dy31             = _mm256_sub_ps(iy3,jy1);
2284             dz31             = _mm256_sub_ps(iz3,jz1);
2285             dx32             = _mm256_sub_ps(ix3,jx2);
2286             dy32             = _mm256_sub_ps(iy3,jy2);
2287             dz32             = _mm256_sub_ps(iz3,jz2);
2288             dx33             = _mm256_sub_ps(ix3,jx3);
2289             dy33             = _mm256_sub_ps(iy3,jy3);
2290             dz33             = _mm256_sub_ps(iz3,jz3);
2291
2292             /* Calculate squared distance and things based on it */
2293             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2294             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2295             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2296             rsq13            = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2297             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2298             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2299             rsq23            = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2300             rsq31            = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2301             rsq32            = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2302             rsq33            = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2303
2304             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
2305             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
2306             rinv13           = gmx_mm256_invsqrt_ps(rsq13);
2307             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
2308             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
2309             rinv23           = gmx_mm256_invsqrt_ps(rsq23);
2310             rinv31           = gmx_mm256_invsqrt_ps(rsq31);
2311             rinv32           = gmx_mm256_invsqrt_ps(rsq32);
2312             rinv33           = gmx_mm256_invsqrt_ps(rsq33);
2313
2314             rinvsq00         = gmx_mm256_inv_ps(rsq00);
2315             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
2316             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
2317             rinvsq13         = _mm256_mul_ps(rinv13,rinv13);
2318             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
2319             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
2320             rinvsq23         = _mm256_mul_ps(rinv23,rinv23);
2321             rinvsq31         = _mm256_mul_ps(rinv31,rinv31);
2322             rinvsq32         = _mm256_mul_ps(rinv32,rinv32);
2323             rinvsq33         = _mm256_mul_ps(rinv33,rinv33);
2324
2325             fjx0             = _mm256_setzero_ps();
2326             fjy0             = _mm256_setzero_ps();
2327             fjz0             = _mm256_setzero_ps();
2328             fjx1             = _mm256_setzero_ps();
2329             fjy1             = _mm256_setzero_ps();
2330             fjz1             = _mm256_setzero_ps();
2331             fjx2             = _mm256_setzero_ps();
2332             fjy2             = _mm256_setzero_ps();
2333             fjz2             = _mm256_setzero_ps();
2334             fjx3             = _mm256_setzero_ps();
2335             fjy3             = _mm256_setzero_ps();
2336             fjz3             = _mm256_setzero_ps();
2337
2338             /**************************
2339              * CALCULATE INTERACTIONS *
2340              **************************/
2341
2342             if (gmx_mm256_any_lt(rsq00,rcutoff2))
2343             {
2344
2345             /* LENNARD-JONES DISPERSION/REPULSION */
2346
2347             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2348             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2349
2350             cutoff_mask      = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2351
2352             fscal            = fvdw;
2353
2354             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2355
2356             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2357
2358             /* Calculate temporary vectorial force */
2359             tx               = _mm256_mul_ps(fscal,dx00);
2360             ty               = _mm256_mul_ps(fscal,dy00);
2361             tz               = _mm256_mul_ps(fscal,dz00);
2362
2363             /* Update vectorial force */
2364             fix0             = _mm256_add_ps(fix0,tx);
2365             fiy0             = _mm256_add_ps(fiy0,ty);
2366             fiz0             = _mm256_add_ps(fiz0,tz);
2367
2368             fjx0             = _mm256_add_ps(fjx0,tx);
2369             fjy0             = _mm256_add_ps(fjy0,ty);
2370             fjz0             = _mm256_add_ps(fjz0,tz);
2371
2372             }
2373
2374             /**************************
2375              * CALCULATE INTERACTIONS *
2376              **************************/
2377
2378             if (gmx_mm256_any_lt(rsq11,rcutoff2))
2379             {
2380
2381             r11              = _mm256_mul_ps(rsq11,rinv11);
2382             r11              = _mm256_andnot_ps(dummy_mask,r11);
2383
2384             /* EWALD ELECTROSTATICS */
2385             
2386             /* Analytical PME correction */
2387             zeta2            = _mm256_mul_ps(beta2,rsq11);
2388             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2389             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2390             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2391             felec            = _mm256_mul_ps(qq11,felec);
2392             
2393             cutoff_mask      = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2394
2395             fscal            = felec;
2396
2397             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2398
2399             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2400
2401             /* Calculate temporary vectorial force */
2402             tx               = _mm256_mul_ps(fscal,dx11);
2403             ty               = _mm256_mul_ps(fscal,dy11);
2404             tz               = _mm256_mul_ps(fscal,dz11);
2405
2406             /* Update vectorial force */
2407             fix1             = _mm256_add_ps(fix1,tx);
2408             fiy1             = _mm256_add_ps(fiy1,ty);
2409             fiz1             = _mm256_add_ps(fiz1,tz);
2410
2411             fjx1             = _mm256_add_ps(fjx1,tx);
2412             fjy1             = _mm256_add_ps(fjy1,ty);
2413             fjz1             = _mm256_add_ps(fjz1,tz);
2414
2415             }
2416
2417             /**************************
2418              * CALCULATE INTERACTIONS *
2419              **************************/
2420
2421             if (gmx_mm256_any_lt(rsq12,rcutoff2))
2422             {
2423
2424             r12              = _mm256_mul_ps(rsq12,rinv12);
2425             r12              = _mm256_andnot_ps(dummy_mask,r12);
2426
2427             /* EWALD ELECTROSTATICS */
2428             
2429             /* Analytical PME correction */
2430             zeta2            = _mm256_mul_ps(beta2,rsq12);
2431             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2432             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2433             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2434             felec            = _mm256_mul_ps(qq12,felec);
2435             
2436             cutoff_mask      = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2437
2438             fscal            = felec;
2439
2440             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2441
2442             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2443
2444             /* Calculate temporary vectorial force */
2445             tx               = _mm256_mul_ps(fscal,dx12);
2446             ty               = _mm256_mul_ps(fscal,dy12);
2447             tz               = _mm256_mul_ps(fscal,dz12);
2448
2449             /* Update vectorial force */
2450             fix1             = _mm256_add_ps(fix1,tx);
2451             fiy1             = _mm256_add_ps(fiy1,ty);
2452             fiz1             = _mm256_add_ps(fiz1,tz);
2453
2454             fjx2             = _mm256_add_ps(fjx2,tx);
2455             fjy2             = _mm256_add_ps(fjy2,ty);
2456             fjz2             = _mm256_add_ps(fjz2,tz);
2457
2458             }
2459
2460             /**************************
2461              * CALCULATE INTERACTIONS *
2462              **************************/
2463
2464             if (gmx_mm256_any_lt(rsq13,rcutoff2))
2465             {
2466
2467             r13              = _mm256_mul_ps(rsq13,rinv13);
2468             r13              = _mm256_andnot_ps(dummy_mask,r13);
2469
2470             /* EWALD ELECTROSTATICS */
2471             
2472             /* Analytical PME correction */
2473             zeta2            = _mm256_mul_ps(beta2,rsq13);
2474             rinv3            = _mm256_mul_ps(rinvsq13,rinv13);
2475             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2476             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2477             felec            = _mm256_mul_ps(qq13,felec);
2478             
2479             cutoff_mask      = _mm256_cmp_ps(rsq13,rcutoff2,_CMP_LT_OQ);
2480
2481             fscal            = felec;
2482
2483             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2484
2485             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2486
2487             /* Calculate temporary vectorial force */
2488             tx               = _mm256_mul_ps(fscal,dx13);
2489             ty               = _mm256_mul_ps(fscal,dy13);
2490             tz               = _mm256_mul_ps(fscal,dz13);
2491
2492             /* Update vectorial force */
2493             fix1             = _mm256_add_ps(fix1,tx);
2494             fiy1             = _mm256_add_ps(fiy1,ty);
2495             fiz1             = _mm256_add_ps(fiz1,tz);
2496
2497             fjx3             = _mm256_add_ps(fjx3,tx);
2498             fjy3             = _mm256_add_ps(fjy3,ty);
2499             fjz3             = _mm256_add_ps(fjz3,tz);
2500
2501             }
2502
2503             /**************************
2504              * CALCULATE INTERACTIONS *
2505              **************************/
2506
2507             if (gmx_mm256_any_lt(rsq21,rcutoff2))
2508             {
2509
2510             r21              = _mm256_mul_ps(rsq21,rinv21);
2511             r21              = _mm256_andnot_ps(dummy_mask,r21);
2512
2513             /* EWALD ELECTROSTATICS */
2514             
2515             /* Analytical PME correction */
2516             zeta2            = _mm256_mul_ps(beta2,rsq21);
2517             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2518             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2519             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2520             felec            = _mm256_mul_ps(qq21,felec);
2521             
2522             cutoff_mask      = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2523
2524             fscal            = felec;
2525
2526             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2527
2528             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2529
2530             /* Calculate temporary vectorial force */
2531             tx               = _mm256_mul_ps(fscal,dx21);
2532             ty               = _mm256_mul_ps(fscal,dy21);
2533             tz               = _mm256_mul_ps(fscal,dz21);
2534
2535             /* Update vectorial force */
2536             fix2             = _mm256_add_ps(fix2,tx);
2537             fiy2             = _mm256_add_ps(fiy2,ty);
2538             fiz2             = _mm256_add_ps(fiz2,tz);
2539
2540             fjx1             = _mm256_add_ps(fjx1,tx);
2541             fjy1             = _mm256_add_ps(fjy1,ty);
2542             fjz1             = _mm256_add_ps(fjz1,tz);
2543
2544             }
2545
2546             /**************************
2547              * CALCULATE INTERACTIONS *
2548              **************************/
2549
2550             if (gmx_mm256_any_lt(rsq22,rcutoff2))
2551             {
2552
2553             r22              = _mm256_mul_ps(rsq22,rinv22);
2554             r22              = _mm256_andnot_ps(dummy_mask,r22);
2555
2556             /* EWALD ELECTROSTATICS */
2557             
2558             /* Analytical PME correction */
2559             zeta2            = _mm256_mul_ps(beta2,rsq22);
2560             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2561             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2562             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2563             felec            = _mm256_mul_ps(qq22,felec);
2564             
2565             cutoff_mask      = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2566
2567             fscal            = felec;
2568
2569             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2570
2571             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2572
2573             /* Calculate temporary vectorial force */
2574             tx               = _mm256_mul_ps(fscal,dx22);
2575             ty               = _mm256_mul_ps(fscal,dy22);
2576             tz               = _mm256_mul_ps(fscal,dz22);
2577
2578             /* Update vectorial force */
2579             fix2             = _mm256_add_ps(fix2,tx);
2580             fiy2             = _mm256_add_ps(fiy2,ty);
2581             fiz2             = _mm256_add_ps(fiz2,tz);
2582
2583             fjx2             = _mm256_add_ps(fjx2,tx);
2584             fjy2             = _mm256_add_ps(fjy2,ty);
2585             fjz2             = _mm256_add_ps(fjz2,tz);
2586
2587             }
2588
2589             /**************************
2590              * CALCULATE INTERACTIONS *
2591              **************************/
2592
2593             if (gmx_mm256_any_lt(rsq23,rcutoff2))
2594             {
2595
2596             r23              = _mm256_mul_ps(rsq23,rinv23);
2597             r23              = _mm256_andnot_ps(dummy_mask,r23);
2598
2599             /* EWALD ELECTROSTATICS */
2600             
2601             /* Analytical PME correction */
2602             zeta2            = _mm256_mul_ps(beta2,rsq23);
2603             rinv3            = _mm256_mul_ps(rinvsq23,rinv23);
2604             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2605             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2606             felec            = _mm256_mul_ps(qq23,felec);
2607             
2608             cutoff_mask      = _mm256_cmp_ps(rsq23,rcutoff2,_CMP_LT_OQ);
2609
2610             fscal            = felec;
2611
2612             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2613
2614             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2615
2616             /* Calculate temporary vectorial force */
2617             tx               = _mm256_mul_ps(fscal,dx23);
2618             ty               = _mm256_mul_ps(fscal,dy23);
2619             tz               = _mm256_mul_ps(fscal,dz23);
2620
2621             /* Update vectorial force */
2622             fix2             = _mm256_add_ps(fix2,tx);
2623             fiy2             = _mm256_add_ps(fiy2,ty);
2624             fiz2             = _mm256_add_ps(fiz2,tz);
2625
2626             fjx3             = _mm256_add_ps(fjx3,tx);
2627             fjy3             = _mm256_add_ps(fjy3,ty);
2628             fjz3             = _mm256_add_ps(fjz3,tz);
2629
2630             }
2631
2632             /**************************
2633              * CALCULATE INTERACTIONS *
2634              **************************/
2635
2636             if (gmx_mm256_any_lt(rsq31,rcutoff2))
2637             {
2638
2639             r31              = _mm256_mul_ps(rsq31,rinv31);
2640             r31              = _mm256_andnot_ps(dummy_mask,r31);
2641
2642             /* EWALD ELECTROSTATICS */
2643             
2644             /* Analytical PME correction */
2645             zeta2            = _mm256_mul_ps(beta2,rsq31);
2646             rinv3            = _mm256_mul_ps(rinvsq31,rinv31);
2647             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2648             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2649             felec            = _mm256_mul_ps(qq31,felec);
2650             
2651             cutoff_mask      = _mm256_cmp_ps(rsq31,rcutoff2,_CMP_LT_OQ);
2652
2653             fscal            = felec;
2654
2655             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2656
2657             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2658
2659             /* Calculate temporary vectorial force */
2660             tx               = _mm256_mul_ps(fscal,dx31);
2661             ty               = _mm256_mul_ps(fscal,dy31);
2662             tz               = _mm256_mul_ps(fscal,dz31);
2663
2664             /* Update vectorial force */
2665             fix3             = _mm256_add_ps(fix3,tx);
2666             fiy3             = _mm256_add_ps(fiy3,ty);
2667             fiz3             = _mm256_add_ps(fiz3,tz);
2668
2669             fjx1             = _mm256_add_ps(fjx1,tx);
2670             fjy1             = _mm256_add_ps(fjy1,ty);
2671             fjz1             = _mm256_add_ps(fjz1,tz);
2672
2673             }
2674
2675             /**************************
2676              * CALCULATE INTERACTIONS *
2677              **************************/
2678
2679             if (gmx_mm256_any_lt(rsq32,rcutoff2))
2680             {
2681
2682             r32              = _mm256_mul_ps(rsq32,rinv32);
2683             r32              = _mm256_andnot_ps(dummy_mask,r32);
2684
2685             /* EWALD ELECTROSTATICS */
2686             
2687             /* Analytical PME correction */
2688             zeta2            = _mm256_mul_ps(beta2,rsq32);
2689             rinv3            = _mm256_mul_ps(rinvsq32,rinv32);
2690             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2691             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2692             felec            = _mm256_mul_ps(qq32,felec);
2693             
2694             cutoff_mask      = _mm256_cmp_ps(rsq32,rcutoff2,_CMP_LT_OQ);
2695
2696             fscal            = felec;
2697
2698             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2699
2700             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2701
2702             /* Calculate temporary vectorial force */
2703             tx               = _mm256_mul_ps(fscal,dx32);
2704             ty               = _mm256_mul_ps(fscal,dy32);
2705             tz               = _mm256_mul_ps(fscal,dz32);
2706
2707             /* Update vectorial force */
2708             fix3             = _mm256_add_ps(fix3,tx);
2709             fiy3             = _mm256_add_ps(fiy3,ty);
2710             fiz3             = _mm256_add_ps(fiz3,tz);
2711
2712             fjx2             = _mm256_add_ps(fjx2,tx);
2713             fjy2             = _mm256_add_ps(fjy2,ty);
2714             fjz2             = _mm256_add_ps(fjz2,tz);
2715
2716             }
2717
2718             /**************************
2719              * CALCULATE INTERACTIONS *
2720              **************************/
2721
2722             if (gmx_mm256_any_lt(rsq33,rcutoff2))
2723             {
2724
2725             r33              = _mm256_mul_ps(rsq33,rinv33);
2726             r33              = _mm256_andnot_ps(dummy_mask,r33);
2727
2728             /* EWALD ELECTROSTATICS */
2729             
2730             /* Analytical PME correction */
2731             zeta2            = _mm256_mul_ps(beta2,rsq33);
2732             rinv3            = _mm256_mul_ps(rinvsq33,rinv33);
2733             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
2734             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2735             felec            = _mm256_mul_ps(qq33,felec);
2736             
2737             cutoff_mask      = _mm256_cmp_ps(rsq33,rcutoff2,_CMP_LT_OQ);
2738
2739             fscal            = felec;
2740
2741             fscal            = _mm256_and_ps(fscal,cutoff_mask);
2742
2743             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2744
2745             /* Calculate temporary vectorial force */
2746             tx               = _mm256_mul_ps(fscal,dx33);
2747             ty               = _mm256_mul_ps(fscal,dy33);
2748             tz               = _mm256_mul_ps(fscal,dz33);
2749
2750             /* Update vectorial force */
2751             fix3             = _mm256_add_ps(fix3,tx);
2752             fiy3             = _mm256_add_ps(fiy3,ty);
2753             fiz3             = _mm256_add_ps(fiz3,tz);
2754
2755             fjx3             = _mm256_add_ps(fjx3,tx);
2756             fjy3             = _mm256_add_ps(fjy3,ty);
2757             fjz3             = _mm256_add_ps(fjz3,tz);
2758
2759             }
2760
2761             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2762             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2763             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2764             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2765             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2766             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2767             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2768             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2769
2770             gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2771                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2772                                                       fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2773
2774             /* Inner loop uses 573 flops */
2775         }
2776
2777         /* End of innermost loop */
2778
2779         gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2780                                                  f+i_coord_offset,fshift+i_shift_offset);
2781
2782         /* Increment number of inner iterations */
2783         inneriter                  += j_index_end - j_index_start;
2784
2785         /* Outer loop uses 24 flops */
2786     }
2787
2788     /* Increment number of outer iterations */
2789     outeriter        += nri;
2790
2791     /* Update outer/inner flops */
2792
2793     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*573);
2794 }