Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEw_VdwLJ_GeomW3W3_avx_256_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_single
51  * Electrostatics interaction: Ewald
52  * VdW interaction:            LennardJones
53  * Geometry:                   Water3-Water3
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecEw_VdwLJ_GeomW3W3_VF_avx_256_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrE,jnrF,jnrG,jnrH;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
83     real             scratch[4*DIM];
84     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     real *           vdwioffsetptr0;
86     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87     real *           vdwioffsetptr1;
88     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89     real *           vdwioffsetptr2;
90     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
92     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
94     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
96     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
107     real             *charge;
108     int              nvdwtype;
109     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
110     int              *vdwtype;
111     real             *vdwparam;
112     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
113     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
114     __m256i          ewitab;
115     __m128i          ewitab_lo,ewitab_hi;
116     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
117     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
118     real             *ewtab;
119     __m256           dummy_mask,cutoff_mask;
120     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
121     __m256           one     = _mm256_set1_ps(1.0);
122     __m256           two     = _mm256_set1_ps(2.0);
123     x                = xx[0];
124     f                = ff[0];
125
126     nri              = nlist->nri;
127     iinr             = nlist->iinr;
128     jindex           = nlist->jindex;
129     jjnr             = nlist->jjnr;
130     shiftidx         = nlist->shift;
131     gid              = nlist->gid;
132     shiftvec         = fr->shift_vec[0];
133     fshift           = fr->fshift[0];
134     facel            = _mm256_set1_ps(fr->ic->epsfac);
135     charge           = mdatoms->chargeA;
136     nvdwtype         = fr->ntype;
137     vdwparam         = fr->nbfp;
138     vdwtype          = mdatoms->typeA;
139
140     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
141     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
142     beta2            = _mm256_mul_ps(beta,beta);
143     beta3            = _mm256_mul_ps(beta,beta2);
144
145     ewtab            = fr->ic->tabq_coul_FDV0;
146     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
147     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
148
149     /* Setup water-specific parameters */
150     inr              = nlist->iinr[0];
151     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
152     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
153     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
154     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
155
156     jq0              = _mm256_set1_ps(charge[inr+0]);
157     jq1              = _mm256_set1_ps(charge[inr+1]);
158     jq2              = _mm256_set1_ps(charge[inr+2]);
159     vdwjidx0A        = 2*vdwtype[inr+0];
160     qq00             = _mm256_mul_ps(iq0,jq0);
161     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
162     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
163     qq01             = _mm256_mul_ps(iq0,jq1);
164     qq02             = _mm256_mul_ps(iq0,jq2);
165     qq10             = _mm256_mul_ps(iq1,jq0);
166     qq11             = _mm256_mul_ps(iq1,jq1);
167     qq12             = _mm256_mul_ps(iq1,jq2);
168     qq20             = _mm256_mul_ps(iq2,jq0);
169     qq21             = _mm256_mul_ps(iq2,jq1);
170     qq22             = _mm256_mul_ps(iq2,jq2);
171
172     /* Avoid stupid compiler warnings */
173     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
174     j_coord_offsetA = 0;
175     j_coord_offsetB = 0;
176     j_coord_offsetC = 0;
177     j_coord_offsetD = 0;
178     j_coord_offsetE = 0;
179     j_coord_offsetF = 0;
180     j_coord_offsetG = 0;
181     j_coord_offsetH = 0;
182
183     outeriter        = 0;
184     inneriter        = 0;
185
186     for(iidx=0;iidx<4*DIM;iidx++)
187     {
188         scratch[iidx] = 0.0;
189     }
190
191     /* Start outer loop over neighborlists */
192     for(iidx=0; iidx<nri; iidx++)
193     {
194         /* Load shift vector for this list */
195         i_shift_offset   = DIM*shiftidx[iidx];
196
197         /* Load limits for loop over neighbors */
198         j_index_start    = jindex[iidx];
199         j_index_end      = jindex[iidx+1];
200
201         /* Get outer coordinate index */
202         inr              = iinr[iidx];
203         i_coord_offset   = DIM*inr;
204
205         /* Load i particle coords and add shift vector */
206         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
207                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
208
209         fix0             = _mm256_setzero_ps();
210         fiy0             = _mm256_setzero_ps();
211         fiz0             = _mm256_setzero_ps();
212         fix1             = _mm256_setzero_ps();
213         fiy1             = _mm256_setzero_ps();
214         fiz1             = _mm256_setzero_ps();
215         fix2             = _mm256_setzero_ps();
216         fiy2             = _mm256_setzero_ps();
217         fiz2             = _mm256_setzero_ps();
218
219         /* Reset potential sums */
220         velecsum         = _mm256_setzero_ps();
221         vvdwsum          = _mm256_setzero_ps();
222
223         /* Start inner kernel loop */
224         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
225         {
226
227             /* Get j neighbor index, and coordinate index */
228             jnrA             = jjnr[jidx];
229             jnrB             = jjnr[jidx+1];
230             jnrC             = jjnr[jidx+2];
231             jnrD             = jjnr[jidx+3];
232             jnrE             = jjnr[jidx+4];
233             jnrF             = jjnr[jidx+5];
234             jnrG             = jjnr[jidx+6];
235             jnrH             = jjnr[jidx+7];
236             j_coord_offsetA  = DIM*jnrA;
237             j_coord_offsetB  = DIM*jnrB;
238             j_coord_offsetC  = DIM*jnrC;
239             j_coord_offsetD  = DIM*jnrD;
240             j_coord_offsetE  = DIM*jnrE;
241             j_coord_offsetF  = DIM*jnrF;
242             j_coord_offsetG  = DIM*jnrG;
243             j_coord_offsetH  = DIM*jnrH;
244
245             /* load j atom coordinates */
246             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
247                                                  x+j_coord_offsetC,x+j_coord_offsetD,
248                                                  x+j_coord_offsetE,x+j_coord_offsetF,
249                                                  x+j_coord_offsetG,x+j_coord_offsetH,
250                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
251
252             /* Calculate displacement vector */
253             dx00             = _mm256_sub_ps(ix0,jx0);
254             dy00             = _mm256_sub_ps(iy0,jy0);
255             dz00             = _mm256_sub_ps(iz0,jz0);
256             dx01             = _mm256_sub_ps(ix0,jx1);
257             dy01             = _mm256_sub_ps(iy0,jy1);
258             dz01             = _mm256_sub_ps(iz0,jz1);
259             dx02             = _mm256_sub_ps(ix0,jx2);
260             dy02             = _mm256_sub_ps(iy0,jy2);
261             dz02             = _mm256_sub_ps(iz0,jz2);
262             dx10             = _mm256_sub_ps(ix1,jx0);
263             dy10             = _mm256_sub_ps(iy1,jy0);
264             dz10             = _mm256_sub_ps(iz1,jz0);
265             dx11             = _mm256_sub_ps(ix1,jx1);
266             dy11             = _mm256_sub_ps(iy1,jy1);
267             dz11             = _mm256_sub_ps(iz1,jz1);
268             dx12             = _mm256_sub_ps(ix1,jx2);
269             dy12             = _mm256_sub_ps(iy1,jy2);
270             dz12             = _mm256_sub_ps(iz1,jz2);
271             dx20             = _mm256_sub_ps(ix2,jx0);
272             dy20             = _mm256_sub_ps(iy2,jy0);
273             dz20             = _mm256_sub_ps(iz2,jz0);
274             dx21             = _mm256_sub_ps(ix2,jx1);
275             dy21             = _mm256_sub_ps(iy2,jy1);
276             dz21             = _mm256_sub_ps(iz2,jz1);
277             dx22             = _mm256_sub_ps(ix2,jx2);
278             dy22             = _mm256_sub_ps(iy2,jy2);
279             dz22             = _mm256_sub_ps(iz2,jz2);
280
281             /* Calculate squared distance and things based on it */
282             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
283             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
284             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
285             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
286             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
287             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
288             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
289             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
290             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
291
292             rinv00           = avx256_invsqrt_f(rsq00);
293             rinv01           = avx256_invsqrt_f(rsq01);
294             rinv02           = avx256_invsqrt_f(rsq02);
295             rinv10           = avx256_invsqrt_f(rsq10);
296             rinv11           = avx256_invsqrt_f(rsq11);
297             rinv12           = avx256_invsqrt_f(rsq12);
298             rinv20           = avx256_invsqrt_f(rsq20);
299             rinv21           = avx256_invsqrt_f(rsq21);
300             rinv22           = avx256_invsqrt_f(rsq22);
301
302             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
303             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
304             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
305             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
306             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
307             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
308             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
309             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
310             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
311
312             fjx0             = _mm256_setzero_ps();
313             fjy0             = _mm256_setzero_ps();
314             fjz0             = _mm256_setzero_ps();
315             fjx1             = _mm256_setzero_ps();
316             fjy1             = _mm256_setzero_ps();
317             fjz1             = _mm256_setzero_ps();
318             fjx2             = _mm256_setzero_ps();
319             fjy2             = _mm256_setzero_ps();
320             fjz2             = _mm256_setzero_ps();
321
322             /**************************
323              * CALCULATE INTERACTIONS *
324              **************************/
325
326             r00              = _mm256_mul_ps(rsq00,rinv00);
327
328             /* EWALD ELECTROSTATICS */
329             
330             /* Analytical PME correction */
331             zeta2            = _mm256_mul_ps(beta2,rsq00);
332             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
333             pmecorrF         = avx256_pmecorrF_f(zeta2);
334             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
335             felec            = _mm256_mul_ps(qq00,felec);
336             pmecorrV         = avx256_pmecorrV_f(zeta2);
337             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
338             velec            = _mm256_sub_ps(rinv00,pmecorrV);
339             velec            = _mm256_mul_ps(qq00,velec);
340             
341             /* LENNARD-JONES DISPERSION/REPULSION */
342
343             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
344             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
345             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
346             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
347             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
348
349             /* Update potential sum for this i atom from the interaction with this j atom. */
350             velecsum         = _mm256_add_ps(velecsum,velec);
351             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
352
353             fscal            = _mm256_add_ps(felec,fvdw);
354
355             /* Calculate temporary vectorial force */
356             tx               = _mm256_mul_ps(fscal,dx00);
357             ty               = _mm256_mul_ps(fscal,dy00);
358             tz               = _mm256_mul_ps(fscal,dz00);
359
360             /* Update vectorial force */
361             fix0             = _mm256_add_ps(fix0,tx);
362             fiy0             = _mm256_add_ps(fiy0,ty);
363             fiz0             = _mm256_add_ps(fiz0,tz);
364
365             fjx0             = _mm256_add_ps(fjx0,tx);
366             fjy0             = _mm256_add_ps(fjy0,ty);
367             fjz0             = _mm256_add_ps(fjz0,tz);
368
369             /**************************
370              * CALCULATE INTERACTIONS *
371              **************************/
372
373             r01              = _mm256_mul_ps(rsq01,rinv01);
374
375             /* EWALD ELECTROSTATICS */
376             
377             /* Analytical PME correction */
378             zeta2            = _mm256_mul_ps(beta2,rsq01);
379             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
380             pmecorrF         = avx256_pmecorrF_f(zeta2);
381             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
382             felec            = _mm256_mul_ps(qq01,felec);
383             pmecorrV         = avx256_pmecorrV_f(zeta2);
384             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
385             velec            = _mm256_sub_ps(rinv01,pmecorrV);
386             velec            = _mm256_mul_ps(qq01,velec);
387             
388             /* Update potential sum for this i atom from the interaction with this j atom. */
389             velecsum         = _mm256_add_ps(velecsum,velec);
390
391             fscal            = felec;
392
393             /* Calculate temporary vectorial force */
394             tx               = _mm256_mul_ps(fscal,dx01);
395             ty               = _mm256_mul_ps(fscal,dy01);
396             tz               = _mm256_mul_ps(fscal,dz01);
397
398             /* Update vectorial force */
399             fix0             = _mm256_add_ps(fix0,tx);
400             fiy0             = _mm256_add_ps(fiy0,ty);
401             fiz0             = _mm256_add_ps(fiz0,tz);
402
403             fjx1             = _mm256_add_ps(fjx1,tx);
404             fjy1             = _mm256_add_ps(fjy1,ty);
405             fjz1             = _mm256_add_ps(fjz1,tz);
406
407             /**************************
408              * CALCULATE INTERACTIONS *
409              **************************/
410
411             r02              = _mm256_mul_ps(rsq02,rinv02);
412
413             /* EWALD ELECTROSTATICS */
414             
415             /* Analytical PME correction */
416             zeta2            = _mm256_mul_ps(beta2,rsq02);
417             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
418             pmecorrF         = avx256_pmecorrF_f(zeta2);
419             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
420             felec            = _mm256_mul_ps(qq02,felec);
421             pmecorrV         = avx256_pmecorrV_f(zeta2);
422             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
423             velec            = _mm256_sub_ps(rinv02,pmecorrV);
424             velec            = _mm256_mul_ps(qq02,velec);
425             
426             /* Update potential sum for this i atom from the interaction with this j atom. */
427             velecsum         = _mm256_add_ps(velecsum,velec);
428
429             fscal            = felec;
430
431             /* Calculate temporary vectorial force */
432             tx               = _mm256_mul_ps(fscal,dx02);
433             ty               = _mm256_mul_ps(fscal,dy02);
434             tz               = _mm256_mul_ps(fscal,dz02);
435
436             /* Update vectorial force */
437             fix0             = _mm256_add_ps(fix0,tx);
438             fiy0             = _mm256_add_ps(fiy0,ty);
439             fiz0             = _mm256_add_ps(fiz0,tz);
440
441             fjx2             = _mm256_add_ps(fjx2,tx);
442             fjy2             = _mm256_add_ps(fjy2,ty);
443             fjz2             = _mm256_add_ps(fjz2,tz);
444
445             /**************************
446              * CALCULATE INTERACTIONS *
447              **************************/
448
449             r10              = _mm256_mul_ps(rsq10,rinv10);
450
451             /* EWALD ELECTROSTATICS */
452             
453             /* Analytical PME correction */
454             zeta2            = _mm256_mul_ps(beta2,rsq10);
455             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
456             pmecorrF         = avx256_pmecorrF_f(zeta2);
457             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
458             felec            = _mm256_mul_ps(qq10,felec);
459             pmecorrV         = avx256_pmecorrV_f(zeta2);
460             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
461             velec            = _mm256_sub_ps(rinv10,pmecorrV);
462             velec            = _mm256_mul_ps(qq10,velec);
463             
464             /* Update potential sum for this i atom from the interaction with this j atom. */
465             velecsum         = _mm256_add_ps(velecsum,velec);
466
467             fscal            = felec;
468
469             /* Calculate temporary vectorial force */
470             tx               = _mm256_mul_ps(fscal,dx10);
471             ty               = _mm256_mul_ps(fscal,dy10);
472             tz               = _mm256_mul_ps(fscal,dz10);
473
474             /* Update vectorial force */
475             fix1             = _mm256_add_ps(fix1,tx);
476             fiy1             = _mm256_add_ps(fiy1,ty);
477             fiz1             = _mm256_add_ps(fiz1,tz);
478
479             fjx0             = _mm256_add_ps(fjx0,tx);
480             fjy0             = _mm256_add_ps(fjy0,ty);
481             fjz0             = _mm256_add_ps(fjz0,tz);
482
483             /**************************
484              * CALCULATE INTERACTIONS *
485              **************************/
486
487             r11              = _mm256_mul_ps(rsq11,rinv11);
488
489             /* EWALD ELECTROSTATICS */
490             
491             /* Analytical PME correction */
492             zeta2            = _mm256_mul_ps(beta2,rsq11);
493             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
494             pmecorrF         = avx256_pmecorrF_f(zeta2);
495             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
496             felec            = _mm256_mul_ps(qq11,felec);
497             pmecorrV         = avx256_pmecorrV_f(zeta2);
498             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
499             velec            = _mm256_sub_ps(rinv11,pmecorrV);
500             velec            = _mm256_mul_ps(qq11,velec);
501             
502             /* Update potential sum for this i atom from the interaction with this j atom. */
503             velecsum         = _mm256_add_ps(velecsum,velec);
504
505             fscal            = felec;
506
507             /* Calculate temporary vectorial force */
508             tx               = _mm256_mul_ps(fscal,dx11);
509             ty               = _mm256_mul_ps(fscal,dy11);
510             tz               = _mm256_mul_ps(fscal,dz11);
511
512             /* Update vectorial force */
513             fix1             = _mm256_add_ps(fix1,tx);
514             fiy1             = _mm256_add_ps(fiy1,ty);
515             fiz1             = _mm256_add_ps(fiz1,tz);
516
517             fjx1             = _mm256_add_ps(fjx1,tx);
518             fjy1             = _mm256_add_ps(fjy1,ty);
519             fjz1             = _mm256_add_ps(fjz1,tz);
520
521             /**************************
522              * CALCULATE INTERACTIONS *
523              **************************/
524
525             r12              = _mm256_mul_ps(rsq12,rinv12);
526
527             /* EWALD ELECTROSTATICS */
528             
529             /* Analytical PME correction */
530             zeta2            = _mm256_mul_ps(beta2,rsq12);
531             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
532             pmecorrF         = avx256_pmecorrF_f(zeta2);
533             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
534             felec            = _mm256_mul_ps(qq12,felec);
535             pmecorrV         = avx256_pmecorrV_f(zeta2);
536             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
537             velec            = _mm256_sub_ps(rinv12,pmecorrV);
538             velec            = _mm256_mul_ps(qq12,velec);
539             
540             /* Update potential sum for this i atom from the interaction with this j atom. */
541             velecsum         = _mm256_add_ps(velecsum,velec);
542
543             fscal            = felec;
544
545             /* Calculate temporary vectorial force */
546             tx               = _mm256_mul_ps(fscal,dx12);
547             ty               = _mm256_mul_ps(fscal,dy12);
548             tz               = _mm256_mul_ps(fscal,dz12);
549
550             /* Update vectorial force */
551             fix1             = _mm256_add_ps(fix1,tx);
552             fiy1             = _mm256_add_ps(fiy1,ty);
553             fiz1             = _mm256_add_ps(fiz1,tz);
554
555             fjx2             = _mm256_add_ps(fjx2,tx);
556             fjy2             = _mm256_add_ps(fjy2,ty);
557             fjz2             = _mm256_add_ps(fjz2,tz);
558
559             /**************************
560              * CALCULATE INTERACTIONS *
561              **************************/
562
563             r20              = _mm256_mul_ps(rsq20,rinv20);
564
565             /* EWALD ELECTROSTATICS */
566             
567             /* Analytical PME correction */
568             zeta2            = _mm256_mul_ps(beta2,rsq20);
569             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
570             pmecorrF         = avx256_pmecorrF_f(zeta2);
571             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
572             felec            = _mm256_mul_ps(qq20,felec);
573             pmecorrV         = avx256_pmecorrV_f(zeta2);
574             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
575             velec            = _mm256_sub_ps(rinv20,pmecorrV);
576             velec            = _mm256_mul_ps(qq20,velec);
577             
578             /* Update potential sum for this i atom from the interaction with this j atom. */
579             velecsum         = _mm256_add_ps(velecsum,velec);
580
581             fscal            = felec;
582
583             /* Calculate temporary vectorial force */
584             tx               = _mm256_mul_ps(fscal,dx20);
585             ty               = _mm256_mul_ps(fscal,dy20);
586             tz               = _mm256_mul_ps(fscal,dz20);
587
588             /* Update vectorial force */
589             fix2             = _mm256_add_ps(fix2,tx);
590             fiy2             = _mm256_add_ps(fiy2,ty);
591             fiz2             = _mm256_add_ps(fiz2,tz);
592
593             fjx0             = _mm256_add_ps(fjx0,tx);
594             fjy0             = _mm256_add_ps(fjy0,ty);
595             fjz0             = _mm256_add_ps(fjz0,tz);
596
597             /**************************
598              * CALCULATE INTERACTIONS *
599              **************************/
600
601             r21              = _mm256_mul_ps(rsq21,rinv21);
602
603             /* EWALD ELECTROSTATICS */
604             
605             /* Analytical PME correction */
606             zeta2            = _mm256_mul_ps(beta2,rsq21);
607             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
608             pmecorrF         = avx256_pmecorrF_f(zeta2);
609             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
610             felec            = _mm256_mul_ps(qq21,felec);
611             pmecorrV         = avx256_pmecorrV_f(zeta2);
612             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
613             velec            = _mm256_sub_ps(rinv21,pmecorrV);
614             velec            = _mm256_mul_ps(qq21,velec);
615             
616             /* Update potential sum for this i atom from the interaction with this j atom. */
617             velecsum         = _mm256_add_ps(velecsum,velec);
618
619             fscal            = felec;
620
621             /* Calculate temporary vectorial force */
622             tx               = _mm256_mul_ps(fscal,dx21);
623             ty               = _mm256_mul_ps(fscal,dy21);
624             tz               = _mm256_mul_ps(fscal,dz21);
625
626             /* Update vectorial force */
627             fix2             = _mm256_add_ps(fix2,tx);
628             fiy2             = _mm256_add_ps(fiy2,ty);
629             fiz2             = _mm256_add_ps(fiz2,tz);
630
631             fjx1             = _mm256_add_ps(fjx1,tx);
632             fjy1             = _mm256_add_ps(fjy1,ty);
633             fjz1             = _mm256_add_ps(fjz1,tz);
634
635             /**************************
636              * CALCULATE INTERACTIONS *
637              **************************/
638
639             r22              = _mm256_mul_ps(rsq22,rinv22);
640
641             /* EWALD ELECTROSTATICS */
642             
643             /* Analytical PME correction */
644             zeta2            = _mm256_mul_ps(beta2,rsq22);
645             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
646             pmecorrF         = avx256_pmecorrF_f(zeta2);
647             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
648             felec            = _mm256_mul_ps(qq22,felec);
649             pmecorrV         = avx256_pmecorrV_f(zeta2);
650             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
651             velec            = _mm256_sub_ps(rinv22,pmecorrV);
652             velec            = _mm256_mul_ps(qq22,velec);
653             
654             /* Update potential sum for this i atom from the interaction with this j atom. */
655             velecsum         = _mm256_add_ps(velecsum,velec);
656
657             fscal            = felec;
658
659             /* Calculate temporary vectorial force */
660             tx               = _mm256_mul_ps(fscal,dx22);
661             ty               = _mm256_mul_ps(fscal,dy22);
662             tz               = _mm256_mul_ps(fscal,dz22);
663
664             /* Update vectorial force */
665             fix2             = _mm256_add_ps(fix2,tx);
666             fiy2             = _mm256_add_ps(fiy2,ty);
667             fiz2             = _mm256_add_ps(fiz2,tz);
668
669             fjx2             = _mm256_add_ps(fjx2,tx);
670             fjy2             = _mm256_add_ps(fjy2,ty);
671             fjz2             = _mm256_add_ps(fjz2,tz);
672
673             fjptrA             = f+j_coord_offsetA;
674             fjptrB             = f+j_coord_offsetB;
675             fjptrC             = f+j_coord_offsetC;
676             fjptrD             = f+j_coord_offsetD;
677             fjptrE             = f+j_coord_offsetE;
678             fjptrF             = f+j_coord_offsetF;
679             fjptrG             = f+j_coord_offsetG;
680             fjptrH             = f+j_coord_offsetH;
681
682             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
683                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
684
685             /* Inner loop uses 768 flops */
686         }
687
688         if(jidx<j_index_end)
689         {
690
691             /* Get j neighbor index, and coordinate index */
692             jnrlistA         = jjnr[jidx];
693             jnrlistB         = jjnr[jidx+1];
694             jnrlistC         = jjnr[jidx+2];
695             jnrlistD         = jjnr[jidx+3];
696             jnrlistE         = jjnr[jidx+4];
697             jnrlistF         = jjnr[jidx+5];
698             jnrlistG         = jjnr[jidx+6];
699             jnrlistH         = jjnr[jidx+7];
700             /* Sign of each element will be negative for non-real atoms.
701              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
702              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
703              */
704             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
705                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
706                                             
707             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
708             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
709             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
710             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
711             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
712             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
713             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
714             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
715             j_coord_offsetA  = DIM*jnrA;
716             j_coord_offsetB  = DIM*jnrB;
717             j_coord_offsetC  = DIM*jnrC;
718             j_coord_offsetD  = DIM*jnrD;
719             j_coord_offsetE  = DIM*jnrE;
720             j_coord_offsetF  = DIM*jnrF;
721             j_coord_offsetG  = DIM*jnrG;
722             j_coord_offsetH  = DIM*jnrH;
723
724             /* load j atom coordinates */
725             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
726                                                  x+j_coord_offsetC,x+j_coord_offsetD,
727                                                  x+j_coord_offsetE,x+j_coord_offsetF,
728                                                  x+j_coord_offsetG,x+j_coord_offsetH,
729                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
730
731             /* Calculate displacement vector */
732             dx00             = _mm256_sub_ps(ix0,jx0);
733             dy00             = _mm256_sub_ps(iy0,jy0);
734             dz00             = _mm256_sub_ps(iz0,jz0);
735             dx01             = _mm256_sub_ps(ix0,jx1);
736             dy01             = _mm256_sub_ps(iy0,jy1);
737             dz01             = _mm256_sub_ps(iz0,jz1);
738             dx02             = _mm256_sub_ps(ix0,jx2);
739             dy02             = _mm256_sub_ps(iy0,jy2);
740             dz02             = _mm256_sub_ps(iz0,jz2);
741             dx10             = _mm256_sub_ps(ix1,jx0);
742             dy10             = _mm256_sub_ps(iy1,jy0);
743             dz10             = _mm256_sub_ps(iz1,jz0);
744             dx11             = _mm256_sub_ps(ix1,jx1);
745             dy11             = _mm256_sub_ps(iy1,jy1);
746             dz11             = _mm256_sub_ps(iz1,jz1);
747             dx12             = _mm256_sub_ps(ix1,jx2);
748             dy12             = _mm256_sub_ps(iy1,jy2);
749             dz12             = _mm256_sub_ps(iz1,jz2);
750             dx20             = _mm256_sub_ps(ix2,jx0);
751             dy20             = _mm256_sub_ps(iy2,jy0);
752             dz20             = _mm256_sub_ps(iz2,jz0);
753             dx21             = _mm256_sub_ps(ix2,jx1);
754             dy21             = _mm256_sub_ps(iy2,jy1);
755             dz21             = _mm256_sub_ps(iz2,jz1);
756             dx22             = _mm256_sub_ps(ix2,jx2);
757             dy22             = _mm256_sub_ps(iy2,jy2);
758             dz22             = _mm256_sub_ps(iz2,jz2);
759
760             /* Calculate squared distance and things based on it */
761             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
762             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
763             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
764             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
765             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
766             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
767             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
768             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
769             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
770
771             rinv00           = avx256_invsqrt_f(rsq00);
772             rinv01           = avx256_invsqrt_f(rsq01);
773             rinv02           = avx256_invsqrt_f(rsq02);
774             rinv10           = avx256_invsqrt_f(rsq10);
775             rinv11           = avx256_invsqrt_f(rsq11);
776             rinv12           = avx256_invsqrt_f(rsq12);
777             rinv20           = avx256_invsqrt_f(rsq20);
778             rinv21           = avx256_invsqrt_f(rsq21);
779             rinv22           = avx256_invsqrt_f(rsq22);
780
781             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
782             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
783             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
784             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
785             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
786             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
787             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
788             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
789             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
790
791             fjx0             = _mm256_setzero_ps();
792             fjy0             = _mm256_setzero_ps();
793             fjz0             = _mm256_setzero_ps();
794             fjx1             = _mm256_setzero_ps();
795             fjy1             = _mm256_setzero_ps();
796             fjz1             = _mm256_setzero_ps();
797             fjx2             = _mm256_setzero_ps();
798             fjy2             = _mm256_setzero_ps();
799             fjz2             = _mm256_setzero_ps();
800
801             /**************************
802              * CALCULATE INTERACTIONS *
803              **************************/
804
805             r00              = _mm256_mul_ps(rsq00,rinv00);
806             r00              = _mm256_andnot_ps(dummy_mask,r00);
807
808             /* EWALD ELECTROSTATICS */
809             
810             /* Analytical PME correction */
811             zeta2            = _mm256_mul_ps(beta2,rsq00);
812             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
813             pmecorrF         = avx256_pmecorrF_f(zeta2);
814             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
815             felec            = _mm256_mul_ps(qq00,felec);
816             pmecorrV         = avx256_pmecorrV_f(zeta2);
817             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
818             velec            = _mm256_sub_ps(rinv00,pmecorrV);
819             velec            = _mm256_mul_ps(qq00,velec);
820             
821             /* LENNARD-JONES DISPERSION/REPULSION */
822
823             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
824             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
825             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
826             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
827             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
828
829             /* Update potential sum for this i atom from the interaction with this j atom. */
830             velec            = _mm256_andnot_ps(dummy_mask,velec);
831             velecsum         = _mm256_add_ps(velecsum,velec);
832             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
833             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
834
835             fscal            = _mm256_add_ps(felec,fvdw);
836
837             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
838
839             /* Calculate temporary vectorial force */
840             tx               = _mm256_mul_ps(fscal,dx00);
841             ty               = _mm256_mul_ps(fscal,dy00);
842             tz               = _mm256_mul_ps(fscal,dz00);
843
844             /* Update vectorial force */
845             fix0             = _mm256_add_ps(fix0,tx);
846             fiy0             = _mm256_add_ps(fiy0,ty);
847             fiz0             = _mm256_add_ps(fiz0,tz);
848
849             fjx0             = _mm256_add_ps(fjx0,tx);
850             fjy0             = _mm256_add_ps(fjy0,ty);
851             fjz0             = _mm256_add_ps(fjz0,tz);
852
853             /**************************
854              * CALCULATE INTERACTIONS *
855              **************************/
856
857             r01              = _mm256_mul_ps(rsq01,rinv01);
858             r01              = _mm256_andnot_ps(dummy_mask,r01);
859
860             /* EWALD ELECTROSTATICS */
861             
862             /* Analytical PME correction */
863             zeta2            = _mm256_mul_ps(beta2,rsq01);
864             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
865             pmecorrF         = avx256_pmecorrF_f(zeta2);
866             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
867             felec            = _mm256_mul_ps(qq01,felec);
868             pmecorrV         = avx256_pmecorrV_f(zeta2);
869             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
870             velec            = _mm256_sub_ps(rinv01,pmecorrV);
871             velec            = _mm256_mul_ps(qq01,velec);
872             
873             /* Update potential sum for this i atom from the interaction with this j atom. */
874             velec            = _mm256_andnot_ps(dummy_mask,velec);
875             velecsum         = _mm256_add_ps(velecsum,velec);
876
877             fscal            = felec;
878
879             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
880
881             /* Calculate temporary vectorial force */
882             tx               = _mm256_mul_ps(fscal,dx01);
883             ty               = _mm256_mul_ps(fscal,dy01);
884             tz               = _mm256_mul_ps(fscal,dz01);
885
886             /* Update vectorial force */
887             fix0             = _mm256_add_ps(fix0,tx);
888             fiy0             = _mm256_add_ps(fiy0,ty);
889             fiz0             = _mm256_add_ps(fiz0,tz);
890
891             fjx1             = _mm256_add_ps(fjx1,tx);
892             fjy1             = _mm256_add_ps(fjy1,ty);
893             fjz1             = _mm256_add_ps(fjz1,tz);
894
895             /**************************
896              * CALCULATE INTERACTIONS *
897              **************************/
898
899             r02              = _mm256_mul_ps(rsq02,rinv02);
900             r02              = _mm256_andnot_ps(dummy_mask,r02);
901
902             /* EWALD ELECTROSTATICS */
903             
904             /* Analytical PME correction */
905             zeta2            = _mm256_mul_ps(beta2,rsq02);
906             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
907             pmecorrF         = avx256_pmecorrF_f(zeta2);
908             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
909             felec            = _mm256_mul_ps(qq02,felec);
910             pmecorrV         = avx256_pmecorrV_f(zeta2);
911             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
912             velec            = _mm256_sub_ps(rinv02,pmecorrV);
913             velec            = _mm256_mul_ps(qq02,velec);
914             
915             /* Update potential sum for this i atom from the interaction with this j atom. */
916             velec            = _mm256_andnot_ps(dummy_mask,velec);
917             velecsum         = _mm256_add_ps(velecsum,velec);
918
919             fscal            = felec;
920
921             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
922
923             /* Calculate temporary vectorial force */
924             tx               = _mm256_mul_ps(fscal,dx02);
925             ty               = _mm256_mul_ps(fscal,dy02);
926             tz               = _mm256_mul_ps(fscal,dz02);
927
928             /* Update vectorial force */
929             fix0             = _mm256_add_ps(fix0,tx);
930             fiy0             = _mm256_add_ps(fiy0,ty);
931             fiz0             = _mm256_add_ps(fiz0,tz);
932
933             fjx2             = _mm256_add_ps(fjx2,tx);
934             fjy2             = _mm256_add_ps(fjy2,ty);
935             fjz2             = _mm256_add_ps(fjz2,tz);
936
937             /**************************
938              * CALCULATE INTERACTIONS *
939              **************************/
940
941             r10              = _mm256_mul_ps(rsq10,rinv10);
942             r10              = _mm256_andnot_ps(dummy_mask,r10);
943
944             /* EWALD ELECTROSTATICS */
945             
946             /* Analytical PME correction */
947             zeta2            = _mm256_mul_ps(beta2,rsq10);
948             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
949             pmecorrF         = avx256_pmecorrF_f(zeta2);
950             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
951             felec            = _mm256_mul_ps(qq10,felec);
952             pmecorrV         = avx256_pmecorrV_f(zeta2);
953             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
954             velec            = _mm256_sub_ps(rinv10,pmecorrV);
955             velec            = _mm256_mul_ps(qq10,velec);
956             
957             /* Update potential sum for this i atom from the interaction with this j atom. */
958             velec            = _mm256_andnot_ps(dummy_mask,velec);
959             velecsum         = _mm256_add_ps(velecsum,velec);
960
961             fscal            = felec;
962
963             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
964
965             /* Calculate temporary vectorial force */
966             tx               = _mm256_mul_ps(fscal,dx10);
967             ty               = _mm256_mul_ps(fscal,dy10);
968             tz               = _mm256_mul_ps(fscal,dz10);
969
970             /* Update vectorial force */
971             fix1             = _mm256_add_ps(fix1,tx);
972             fiy1             = _mm256_add_ps(fiy1,ty);
973             fiz1             = _mm256_add_ps(fiz1,tz);
974
975             fjx0             = _mm256_add_ps(fjx0,tx);
976             fjy0             = _mm256_add_ps(fjy0,ty);
977             fjz0             = _mm256_add_ps(fjz0,tz);
978
979             /**************************
980              * CALCULATE INTERACTIONS *
981              **************************/
982
983             r11              = _mm256_mul_ps(rsq11,rinv11);
984             r11              = _mm256_andnot_ps(dummy_mask,r11);
985
986             /* EWALD ELECTROSTATICS */
987             
988             /* Analytical PME correction */
989             zeta2            = _mm256_mul_ps(beta2,rsq11);
990             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
991             pmecorrF         = avx256_pmecorrF_f(zeta2);
992             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
993             felec            = _mm256_mul_ps(qq11,felec);
994             pmecorrV         = avx256_pmecorrV_f(zeta2);
995             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
996             velec            = _mm256_sub_ps(rinv11,pmecorrV);
997             velec            = _mm256_mul_ps(qq11,velec);
998             
999             /* Update potential sum for this i atom from the interaction with this j atom. */
1000             velec            = _mm256_andnot_ps(dummy_mask,velec);
1001             velecsum         = _mm256_add_ps(velecsum,velec);
1002
1003             fscal            = felec;
1004
1005             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1006
1007             /* Calculate temporary vectorial force */
1008             tx               = _mm256_mul_ps(fscal,dx11);
1009             ty               = _mm256_mul_ps(fscal,dy11);
1010             tz               = _mm256_mul_ps(fscal,dz11);
1011
1012             /* Update vectorial force */
1013             fix1             = _mm256_add_ps(fix1,tx);
1014             fiy1             = _mm256_add_ps(fiy1,ty);
1015             fiz1             = _mm256_add_ps(fiz1,tz);
1016
1017             fjx1             = _mm256_add_ps(fjx1,tx);
1018             fjy1             = _mm256_add_ps(fjy1,ty);
1019             fjz1             = _mm256_add_ps(fjz1,tz);
1020
1021             /**************************
1022              * CALCULATE INTERACTIONS *
1023              **************************/
1024
1025             r12              = _mm256_mul_ps(rsq12,rinv12);
1026             r12              = _mm256_andnot_ps(dummy_mask,r12);
1027
1028             /* EWALD ELECTROSTATICS */
1029             
1030             /* Analytical PME correction */
1031             zeta2            = _mm256_mul_ps(beta2,rsq12);
1032             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1033             pmecorrF         = avx256_pmecorrF_f(zeta2);
1034             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1035             felec            = _mm256_mul_ps(qq12,felec);
1036             pmecorrV         = avx256_pmecorrV_f(zeta2);
1037             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1038             velec            = _mm256_sub_ps(rinv12,pmecorrV);
1039             velec            = _mm256_mul_ps(qq12,velec);
1040             
1041             /* Update potential sum for this i atom from the interaction with this j atom. */
1042             velec            = _mm256_andnot_ps(dummy_mask,velec);
1043             velecsum         = _mm256_add_ps(velecsum,velec);
1044
1045             fscal            = felec;
1046
1047             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1048
1049             /* Calculate temporary vectorial force */
1050             tx               = _mm256_mul_ps(fscal,dx12);
1051             ty               = _mm256_mul_ps(fscal,dy12);
1052             tz               = _mm256_mul_ps(fscal,dz12);
1053
1054             /* Update vectorial force */
1055             fix1             = _mm256_add_ps(fix1,tx);
1056             fiy1             = _mm256_add_ps(fiy1,ty);
1057             fiz1             = _mm256_add_ps(fiz1,tz);
1058
1059             fjx2             = _mm256_add_ps(fjx2,tx);
1060             fjy2             = _mm256_add_ps(fjy2,ty);
1061             fjz2             = _mm256_add_ps(fjz2,tz);
1062
1063             /**************************
1064              * CALCULATE INTERACTIONS *
1065              **************************/
1066
1067             r20              = _mm256_mul_ps(rsq20,rinv20);
1068             r20              = _mm256_andnot_ps(dummy_mask,r20);
1069
1070             /* EWALD ELECTROSTATICS */
1071             
1072             /* Analytical PME correction */
1073             zeta2            = _mm256_mul_ps(beta2,rsq20);
1074             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1075             pmecorrF         = avx256_pmecorrF_f(zeta2);
1076             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1077             felec            = _mm256_mul_ps(qq20,felec);
1078             pmecorrV         = avx256_pmecorrV_f(zeta2);
1079             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1080             velec            = _mm256_sub_ps(rinv20,pmecorrV);
1081             velec            = _mm256_mul_ps(qq20,velec);
1082             
1083             /* Update potential sum for this i atom from the interaction with this j atom. */
1084             velec            = _mm256_andnot_ps(dummy_mask,velec);
1085             velecsum         = _mm256_add_ps(velecsum,velec);
1086
1087             fscal            = felec;
1088
1089             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1090
1091             /* Calculate temporary vectorial force */
1092             tx               = _mm256_mul_ps(fscal,dx20);
1093             ty               = _mm256_mul_ps(fscal,dy20);
1094             tz               = _mm256_mul_ps(fscal,dz20);
1095
1096             /* Update vectorial force */
1097             fix2             = _mm256_add_ps(fix2,tx);
1098             fiy2             = _mm256_add_ps(fiy2,ty);
1099             fiz2             = _mm256_add_ps(fiz2,tz);
1100
1101             fjx0             = _mm256_add_ps(fjx0,tx);
1102             fjy0             = _mm256_add_ps(fjy0,ty);
1103             fjz0             = _mm256_add_ps(fjz0,tz);
1104
1105             /**************************
1106              * CALCULATE INTERACTIONS *
1107              **************************/
1108
1109             r21              = _mm256_mul_ps(rsq21,rinv21);
1110             r21              = _mm256_andnot_ps(dummy_mask,r21);
1111
1112             /* EWALD ELECTROSTATICS */
1113             
1114             /* Analytical PME correction */
1115             zeta2            = _mm256_mul_ps(beta2,rsq21);
1116             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1117             pmecorrF         = avx256_pmecorrF_f(zeta2);
1118             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1119             felec            = _mm256_mul_ps(qq21,felec);
1120             pmecorrV         = avx256_pmecorrV_f(zeta2);
1121             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1122             velec            = _mm256_sub_ps(rinv21,pmecorrV);
1123             velec            = _mm256_mul_ps(qq21,velec);
1124             
1125             /* Update potential sum for this i atom from the interaction with this j atom. */
1126             velec            = _mm256_andnot_ps(dummy_mask,velec);
1127             velecsum         = _mm256_add_ps(velecsum,velec);
1128
1129             fscal            = felec;
1130
1131             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1132
1133             /* Calculate temporary vectorial force */
1134             tx               = _mm256_mul_ps(fscal,dx21);
1135             ty               = _mm256_mul_ps(fscal,dy21);
1136             tz               = _mm256_mul_ps(fscal,dz21);
1137
1138             /* Update vectorial force */
1139             fix2             = _mm256_add_ps(fix2,tx);
1140             fiy2             = _mm256_add_ps(fiy2,ty);
1141             fiz2             = _mm256_add_ps(fiz2,tz);
1142
1143             fjx1             = _mm256_add_ps(fjx1,tx);
1144             fjy1             = _mm256_add_ps(fjy1,ty);
1145             fjz1             = _mm256_add_ps(fjz1,tz);
1146
1147             /**************************
1148              * CALCULATE INTERACTIONS *
1149              **************************/
1150
1151             r22              = _mm256_mul_ps(rsq22,rinv22);
1152             r22              = _mm256_andnot_ps(dummy_mask,r22);
1153
1154             /* EWALD ELECTROSTATICS */
1155             
1156             /* Analytical PME correction */
1157             zeta2            = _mm256_mul_ps(beta2,rsq22);
1158             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1159             pmecorrF         = avx256_pmecorrF_f(zeta2);
1160             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1161             felec            = _mm256_mul_ps(qq22,felec);
1162             pmecorrV         = avx256_pmecorrV_f(zeta2);
1163             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
1164             velec            = _mm256_sub_ps(rinv22,pmecorrV);
1165             velec            = _mm256_mul_ps(qq22,velec);
1166             
1167             /* Update potential sum for this i atom from the interaction with this j atom. */
1168             velec            = _mm256_andnot_ps(dummy_mask,velec);
1169             velecsum         = _mm256_add_ps(velecsum,velec);
1170
1171             fscal            = felec;
1172
1173             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1174
1175             /* Calculate temporary vectorial force */
1176             tx               = _mm256_mul_ps(fscal,dx22);
1177             ty               = _mm256_mul_ps(fscal,dy22);
1178             tz               = _mm256_mul_ps(fscal,dz22);
1179
1180             /* Update vectorial force */
1181             fix2             = _mm256_add_ps(fix2,tx);
1182             fiy2             = _mm256_add_ps(fiy2,ty);
1183             fiz2             = _mm256_add_ps(fiz2,tz);
1184
1185             fjx2             = _mm256_add_ps(fjx2,tx);
1186             fjy2             = _mm256_add_ps(fjy2,ty);
1187             fjz2             = _mm256_add_ps(fjz2,tz);
1188
1189             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1190             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1191             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1192             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1193             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1194             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1195             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1196             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1197
1198             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1199                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1200
1201             /* Inner loop uses 777 flops */
1202         }
1203
1204         /* End of innermost loop */
1205
1206         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1207                                                  f+i_coord_offset,fshift+i_shift_offset);
1208
1209         ggid                        = gid[iidx];
1210         /* Update potential energies */
1211         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1212         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1213
1214         /* Increment number of inner iterations */
1215         inneriter                  += j_index_end - j_index_start;
1216
1217         /* Outer loop uses 20 flops */
1218     }
1219
1220     /* Increment number of outer iterations */
1221     outeriter        += nri;
1222
1223     /* Update outer/inner flops */
1224
1225     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*777);
1226 }
1227 /*
1228  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_single
1229  * Electrostatics interaction: Ewald
1230  * VdW interaction:            LennardJones
1231  * Geometry:                   Water3-Water3
1232  * Calculate force/pot:        Force
1233  */
1234 void
1235 nb_kernel_ElecEw_VdwLJ_GeomW3W3_F_avx_256_single
1236                     (t_nblist                    * gmx_restrict       nlist,
1237                      rvec                        * gmx_restrict          xx,
1238                      rvec                        * gmx_restrict          ff,
1239                      struct t_forcerec           * gmx_restrict          fr,
1240                      t_mdatoms                   * gmx_restrict     mdatoms,
1241                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1242                      t_nrnb                      * gmx_restrict        nrnb)
1243 {
1244     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1245      * just 0 for non-waters.
1246      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1247      * jnr indices corresponding to data put in the four positions in the SIMD register.
1248      */
1249     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1250     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1251     int              jnrA,jnrB,jnrC,jnrD;
1252     int              jnrE,jnrF,jnrG,jnrH;
1253     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1254     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1255     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1256     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1257     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1258     real             rcutoff_scalar;
1259     real             *shiftvec,*fshift,*x,*f;
1260     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1261     real             scratch[4*DIM];
1262     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1263     real *           vdwioffsetptr0;
1264     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1265     real *           vdwioffsetptr1;
1266     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1267     real *           vdwioffsetptr2;
1268     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1269     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1270     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1271     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1272     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1273     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1274     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1275     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1276     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1277     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1278     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1279     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1280     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1281     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1282     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1283     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1284     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1285     real             *charge;
1286     int              nvdwtype;
1287     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1288     int              *vdwtype;
1289     real             *vdwparam;
1290     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1291     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1292     __m256i          ewitab;
1293     __m128i          ewitab_lo,ewitab_hi;
1294     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
1295     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
1296     real             *ewtab;
1297     __m256           dummy_mask,cutoff_mask;
1298     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1299     __m256           one     = _mm256_set1_ps(1.0);
1300     __m256           two     = _mm256_set1_ps(2.0);
1301     x                = xx[0];
1302     f                = ff[0];
1303
1304     nri              = nlist->nri;
1305     iinr             = nlist->iinr;
1306     jindex           = nlist->jindex;
1307     jjnr             = nlist->jjnr;
1308     shiftidx         = nlist->shift;
1309     gid              = nlist->gid;
1310     shiftvec         = fr->shift_vec[0];
1311     fshift           = fr->fshift[0];
1312     facel            = _mm256_set1_ps(fr->ic->epsfac);
1313     charge           = mdatoms->chargeA;
1314     nvdwtype         = fr->ntype;
1315     vdwparam         = fr->nbfp;
1316     vdwtype          = mdatoms->typeA;
1317
1318     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
1319     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
1320     beta2            = _mm256_mul_ps(beta,beta);
1321     beta3            = _mm256_mul_ps(beta,beta2);
1322
1323     ewtab            = fr->ic->tabq_coul_F;
1324     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
1325     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
1326
1327     /* Setup water-specific parameters */
1328     inr              = nlist->iinr[0];
1329     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1330     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1331     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1332     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1333
1334     jq0              = _mm256_set1_ps(charge[inr+0]);
1335     jq1              = _mm256_set1_ps(charge[inr+1]);
1336     jq2              = _mm256_set1_ps(charge[inr+2]);
1337     vdwjidx0A        = 2*vdwtype[inr+0];
1338     qq00             = _mm256_mul_ps(iq0,jq0);
1339     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1340     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1341     qq01             = _mm256_mul_ps(iq0,jq1);
1342     qq02             = _mm256_mul_ps(iq0,jq2);
1343     qq10             = _mm256_mul_ps(iq1,jq0);
1344     qq11             = _mm256_mul_ps(iq1,jq1);
1345     qq12             = _mm256_mul_ps(iq1,jq2);
1346     qq20             = _mm256_mul_ps(iq2,jq0);
1347     qq21             = _mm256_mul_ps(iq2,jq1);
1348     qq22             = _mm256_mul_ps(iq2,jq2);
1349
1350     /* Avoid stupid compiler warnings */
1351     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1352     j_coord_offsetA = 0;
1353     j_coord_offsetB = 0;
1354     j_coord_offsetC = 0;
1355     j_coord_offsetD = 0;
1356     j_coord_offsetE = 0;
1357     j_coord_offsetF = 0;
1358     j_coord_offsetG = 0;
1359     j_coord_offsetH = 0;
1360
1361     outeriter        = 0;
1362     inneriter        = 0;
1363
1364     for(iidx=0;iidx<4*DIM;iidx++)
1365     {
1366         scratch[iidx] = 0.0;
1367     }
1368
1369     /* Start outer loop over neighborlists */
1370     for(iidx=0; iidx<nri; iidx++)
1371     {
1372         /* Load shift vector for this list */
1373         i_shift_offset   = DIM*shiftidx[iidx];
1374
1375         /* Load limits for loop over neighbors */
1376         j_index_start    = jindex[iidx];
1377         j_index_end      = jindex[iidx+1];
1378
1379         /* Get outer coordinate index */
1380         inr              = iinr[iidx];
1381         i_coord_offset   = DIM*inr;
1382
1383         /* Load i particle coords and add shift vector */
1384         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1385                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1386
1387         fix0             = _mm256_setzero_ps();
1388         fiy0             = _mm256_setzero_ps();
1389         fiz0             = _mm256_setzero_ps();
1390         fix1             = _mm256_setzero_ps();
1391         fiy1             = _mm256_setzero_ps();
1392         fiz1             = _mm256_setzero_ps();
1393         fix2             = _mm256_setzero_ps();
1394         fiy2             = _mm256_setzero_ps();
1395         fiz2             = _mm256_setzero_ps();
1396
1397         /* Start inner kernel loop */
1398         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1399         {
1400
1401             /* Get j neighbor index, and coordinate index */
1402             jnrA             = jjnr[jidx];
1403             jnrB             = jjnr[jidx+1];
1404             jnrC             = jjnr[jidx+2];
1405             jnrD             = jjnr[jidx+3];
1406             jnrE             = jjnr[jidx+4];
1407             jnrF             = jjnr[jidx+5];
1408             jnrG             = jjnr[jidx+6];
1409             jnrH             = jjnr[jidx+7];
1410             j_coord_offsetA  = DIM*jnrA;
1411             j_coord_offsetB  = DIM*jnrB;
1412             j_coord_offsetC  = DIM*jnrC;
1413             j_coord_offsetD  = DIM*jnrD;
1414             j_coord_offsetE  = DIM*jnrE;
1415             j_coord_offsetF  = DIM*jnrF;
1416             j_coord_offsetG  = DIM*jnrG;
1417             j_coord_offsetH  = DIM*jnrH;
1418
1419             /* load j atom coordinates */
1420             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1421                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1422                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1423                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1424                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1425
1426             /* Calculate displacement vector */
1427             dx00             = _mm256_sub_ps(ix0,jx0);
1428             dy00             = _mm256_sub_ps(iy0,jy0);
1429             dz00             = _mm256_sub_ps(iz0,jz0);
1430             dx01             = _mm256_sub_ps(ix0,jx1);
1431             dy01             = _mm256_sub_ps(iy0,jy1);
1432             dz01             = _mm256_sub_ps(iz0,jz1);
1433             dx02             = _mm256_sub_ps(ix0,jx2);
1434             dy02             = _mm256_sub_ps(iy0,jy2);
1435             dz02             = _mm256_sub_ps(iz0,jz2);
1436             dx10             = _mm256_sub_ps(ix1,jx0);
1437             dy10             = _mm256_sub_ps(iy1,jy0);
1438             dz10             = _mm256_sub_ps(iz1,jz0);
1439             dx11             = _mm256_sub_ps(ix1,jx1);
1440             dy11             = _mm256_sub_ps(iy1,jy1);
1441             dz11             = _mm256_sub_ps(iz1,jz1);
1442             dx12             = _mm256_sub_ps(ix1,jx2);
1443             dy12             = _mm256_sub_ps(iy1,jy2);
1444             dz12             = _mm256_sub_ps(iz1,jz2);
1445             dx20             = _mm256_sub_ps(ix2,jx0);
1446             dy20             = _mm256_sub_ps(iy2,jy0);
1447             dz20             = _mm256_sub_ps(iz2,jz0);
1448             dx21             = _mm256_sub_ps(ix2,jx1);
1449             dy21             = _mm256_sub_ps(iy2,jy1);
1450             dz21             = _mm256_sub_ps(iz2,jz1);
1451             dx22             = _mm256_sub_ps(ix2,jx2);
1452             dy22             = _mm256_sub_ps(iy2,jy2);
1453             dz22             = _mm256_sub_ps(iz2,jz2);
1454
1455             /* Calculate squared distance and things based on it */
1456             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1457             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1458             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1459             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1460             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1461             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1462             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1463             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1464             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1465
1466             rinv00           = avx256_invsqrt_f(rsq00);
1467             rinv01           = avx256_invsqrt_f(rsq01);
1468             rinv02           = avx256_invsqrt_f(rsq02);
1469             rinv10           = avx256_invsqrt_f(rsq10);
1470             rinv11           = avx256_invsqrt_f(rsq11);
1471             rinv12           = avx256_invsqrt_f(rsq12);
1472             rinv20           = avx256_invsqrt_f(rsq20);
1473             rinv21           = avx256_invsqrt_f(rsq21);
1474             rinv22           = avx256_invsqrt_f(rsq22);
1475
1476             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1477             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1478             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1479             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1480             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1481             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1482             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1483             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1484             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1485
1486             fjx0             = _mm256_setzero_ps();
1487             fjy0             = _mm256_setzero_ps();
1488             fjz0             = _mm256_setzero_ps();
1489             fjx1             = _mm256_setzero_ps();
1490             fjy1             = _mm256_setzero_ps();
1491             fjz1             = _mm256_setzero_ps();
1492             fjx2             = _mm256_setzero_ps();
1493             fjy2             = _mm256_setzero_ps();
1494             fjz2             = _mm256_setzero_ps();
1495
1496             /**************************
1497              * CALCULATE INTERACTIONS *
1498              **************************/
1499
1500             r00              = _mm256_mul_ps(rsq00,rinv00);
1501
1502             /* EWALD ELECTROSTATICS */
1503             
1504             /* Analytical PME correction */
1505             zeta2            = _mm256_mul_ps(beta2,rsq00);
1506             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1507             pmecorrF         = avx256_pmecorrF_f(zeta2);
1508             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1509             felec            = _mm256_mul_ps(qq00,felec);
1510             
1511             /* LENNARD-JONES DISPERSION/REPULSION */
1512
1513             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1514             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1515
1516             fscal            = _mm256_add_ps(felec,fvdw);
1517
1518             /* Calculate temporary vectorial force */
1519             tx               = _mm256_mul_ps(fscal,dx00);
1520             ty               = _mm256_mul_ps(fscal,dy00);
1521             tz               = _mm256_mul_ps(fscal,dz00);
1522
1523             /* Update vectorial force */
1524             fix0             = _mm256_add_ps(fix0,tx);
1525             fiy0             = _mm256_add_ps(fiy0,ty);
1526             fiz0             = _mm256_add_ps(fiz0,tz);
1527
1528             fjx0             = _mm256_add_ps(fjx0,tx);
1529             fjy0             = _mm256_add_ps(fjy0,ty);
1530             fjz0             = _mm256_add_ps(fjz0,tz);
1531
1532             /**************************
1533              * CALCULATE INTERACTIONS *
1534              **************************/
1535
1536             r01              = _mm256_mul_ps(rsq01,rinv01);
1537
1538             /* EWALD ELECTROSTATICS */
1539             
1540             /* Analytical PME correction */
1541             zeta2            = _mm256_mul_ps(beta2,rsq01);
1542             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1543             pmecorrF         = avx256_pmecorrF_f(zeta2);
1544             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1545             felec            = _mm256_mul_ps(qq01,felec);
1546             
1547             fscal            = felec;
1548
1549             /* Calculate temporary vectorial force */
1550             tx               = _mm256_mul_ps(fscal,dx01);
1551             ty               = _mm256_mul_ps(fscal,dy01);
1552             tz               = _mm256_mul_ps(fscal,dz01);
1553
1554             /* Update vectorial force */
1555             fix0             = _mm256_add_ps(fix0,tx);
1556             fiy0             = _mm256_add_ps(fiy0,ty);
1557             fiz0             = _mm256_add_ps(fiz0,tz);
1558
1559             fjx1             = _mm256_add_ps(fjx1,tx);
1560             fjy1             = _mm256_add_ps(fjy1,ty);
1561             fjz1             = _mm256_add_ps(fjz1,tz);
1562
1563             /**************************
1564              * CALCULATE INTERACTIONS *
1565              **************************/
1566
1567             r02              = _mm256_mul_ps(rsq02,rinv02);
1568
1569             /* EWALD ELECTROSTATICS */
1570             
1571             /* Analytical PME correction */
1572             zeta2            = _mm256_mul_ps(beta2,rsq02);
1573             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1574             pmecorrF         = avx256_pmecorrF_f(zeta2);
1575             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1576             felec            = _mm256_mul_ps(qq02,felec);
1577             
1578             fscal            = felec;
1579
1580             /* Calculate temporary vectorial force */
1581             tx               = _mm256_mul_ps(fscal,dx02);
1582             ty               = _mm256_mul_ps(fscal,dy02);
1583             tz               = _mm256_mul_ps(fscal,dz02);
1584
1585             /* Update vectorial force */
1586             fix0             = _mm256_add_ps(fix0,tx);
1587             fiy0             = _mm256_add_ps(fiy0,ty);
1588             fiz0             = _mm256_add_ps(fiz0,tz);
1589
1590             fjx2             = _mm256_add_ps(fjx2,tx);
1591             fjy2             = _mm256_add_ps(fjy2,ty);
1592             fjz2             = _mm256_add_ps(fjz2,tz);
1593
1594             /**************************
1595              * CALCULATE INTERACTIONS *
1596              **************************/
1597
1598             r10              = _mm256_mul_ps(rsq10,rinv10);
1599
1600             /* EWALD ELECTROSTATICS */
1601             
1602             /* Analytical PME correction */
1603             zeta2            = _mm256_mul_ps(beta2,rsq10);
1604             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1605             pmecorrF         = avx256_pmecorrF_f(zeta2);
1606             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1607             felec            = _mm256_mul_ps(qq10,felec);
1608             
1609             fscal            = felec;
1610
1611             /* Calculate temporary vectorial force */
1612             tx               = _mm256_mul_ps(fscal,dx10);
1613             ty               = _mm256_mul_ps(fscal,dy10);
1614             tz               = _mm256_mul_ps(fscal,dz10);
1615
1616             /* Update vectorial force */
1617             fix1             = _mm256_add_ps(fix1,tx);
1618             fiy1             = _mm256_add_ps(fiy1,ty);
1619             fiz1             = _mm256_add_ps(fiz1,tz);
1620
1621             fjx0             = _mm256_add_ps(fjx0,tx);
1622             fjy0             = _mm256_add_ps(fjy0,ty);
1623             fjz0             = _mm256_add_ps(fjz0,tz);
1624
1625             /**************************
1626              * CALCULATE INTERACTIONS *
1627              **************************/
1628
1629             r11              = _mm256_mul_ps(rsq11,rinv11);
1630
1631             /* EWALD ELECTROSTATICS */
1632             
1633             /* Analytical PME correction */
1634             zeta2            = _mm256_mul_ps(beta2,rsq11);
1635             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
1636             pmecorrF         = avx256_pmecorrF_f(zeta2);
1637             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1638             felec            = _mm256_mul_ps(qq11,felec);
1639             
1640             fscal            = felec;
1641
1642             /* Calculate temporary vectorial force */
1643             tx               = _mm256_mul_ps(fscal,dx11);
1644             ty               = _mm256_mul_ps(fscal,dy11);
1645             tz               = _mm256_mul_ps(fscal,dz11);
1646
1647             /* Update vectorial force */
1648             fix1             = _mm256_add_ps(fix1,tx);
1649             fiy1             = _mm256_add_ps(fiy1,ty);
1650             fiz1             = _mm256_add_ps(fiz1,tz);
1651
1652             fjx1             = _mm256_add_ps(fjx1,tx);
1653             fjy1             = _mm256_add_ps(fjy1,ty);
1654             fjz1             = _mm256_add_ps(fjz1,tz);
1655
1656             /**************************
1657              * CALCULATE INTERACTIONS *
1658              **************************/
1659
1660             r12              = _mm256_mul_ps(rsq12,rinv12);
1661
1662             /* EWALD ELECTROSTATICS */
1663             
1664             /* Analytical PME correction */
1665             zeta2            = _mm256_mul_ps(beta2,rsq12);
1666             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
1667             pmecorrF         = avx256_pmecorrF_f(zeta2);
1668             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1669             felec            = _mm256_mul_ps(qq12,felec);
1670             
1671             fscal            = felec;
1672
1673             /* Calculate temporary vectorial force */
1674             tx               = _mm256_mul_ps(fscal,dx12);
1675             ty               = _mm256_mul_ps(fscal,dy12);
1676             tz               = _mm256_mul_ps(fscal,dz12);
1677
1678             /* Update vectorial force */
1679             fix1             = _mm256_add_ps(fix1,tx);
1680             fiy1             = _mm256_add_ps(fiy1,ty);
1681             fiz1             = _mm256_add_ps(fiz1,tz);
1682
1683             fjx2             = _mm256_add_ps(fjx2,tx);
1684             fjy2             = _mm256_add_ps(fjy2,ty);
1685             fjz2             = _mm256_add_ps(fjz2,tz);
1686
1687             /**************************
1688              * CALCULATE INTERACTIONS *
1689              **************************/
1690
1691             r20              = _mm256_mul_ps(rsq20,rinv20);
1692
1693             /* EWALD ELECTROSTATICS */
1694             
1695             /* Analytical PME correction */
1696             zeta2            = _mm256_mul_ps(beta2,rsq20);
1697             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1698             pmecorrF         = avx256_pmecorrF_f(zeta2);
1699             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1700             felec            = _mm256_mul_ps(qq20,felec);
1701             
1702             fscal            = felec;
1703
1704             /* Calculate temporary vectorial force */
1705             tx               = _mm256_mul_ps(fscal,dx20);
1706             ty               = _mm256_mul_ps(fscal,dy20);
1707             tz               = _mm256_mul_ps(fscal,dz20);
1708
1709             /* Update vectorial force */
1710             fix2             = _mm256_add_ps(fix2,tx);
1711             fiy2             = _mm256_add_ps(fiy2,ty);
1712             fiz2             = _mm256_add_ps(fiz2,tz);
1713
1714             fjx0             = _mm256_add_ps(fjx0,tx);
1715             fjy0             = _mm256_add_ps(fjy0,ty);
1716             fjz0             = _mm256_add_ps(fjz0,tz);
1717
1718             /**************************
1719              * CALCULATE INTERACTIONS *
1720              **************************/
1721
1722             r21              = _mm256_mul_ps(rsq21,rinv21);
1723
1724             /* EWALD ELECTROSTATICS */
1725             
1726             /* Analytical PME correction */
1727             zeta2            = _mm256_mul_ps(beta2,rsq21);
1728             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
1729             pmecorrF         = avx256_pmecorrF_f(zeta2);
1730             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1731             felec            = _mm256_mul_ps(qq21,felec);
1732             
1733             fscal            = felec;
1734
1735             /* Calculate temporary vectorial force */
1736             tx               = _mm256_mul_ps(fscal,dx21);
1737             ty               = _mm256_mul_ps(fscal,dy21);
1738             tz               = _mm256_mul_ps(fscal,dz21);
1739
1740             /* Update vectorial force */
1741             fix2             = _mm256_add_ps(fix2,tx);
1742             fiy2             = _mm256_add_ps(fiy2,ty);
1743             fiz2             = _mm256_add_ps(fiz2,tz);
1744
1745             fjx1             = _mm256_add_ps(fjx1,tx);
1746             fjy1             = _mm256_add_ps(fjy1,ty);
1747             fjz1             = _mm256_add_ps(fjz1,tz);
1748
1749             /**************************
1750              * CALCULATE INTERACTIONS *
1751              **************************/
1752
1753             r22              = _mm256_mul_ps(rsq22,rinv22);
1754
1755             /* EWALD ELECTROSTATICS */
1756             
1757             /* Analytical PME correction */
1758             zeta2            = _mm256_mul_ps(beta2,rsq22);
1759             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
1760             pmecorrF         = avx256_pmecorrF_f(zeta2);
1761             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1762             felec            = _mm256_mul_ps(qq22,felec);
1763             
1764             fscal            = felec;
1765
1766             /* Calculate temporary vectorial force */
1767             tx               = _mm256_mul_ps(fscal,dx22);
1768             ty               = _mm256_mul_ps(fscal,dy22);
1769             tz               = _mm256_mul_ps(fscal,dz22);
1770
1771             /* Update vectorial force */
1772             fix2             = _mm256_add_ps(fix2,tx);
1773             fiy2             = _mm256_add_ps(fiy2,ty);
1774             fiz2             = _mm256_add_ps(fiz2,tz);
1775
1776             fjx2             = _mm256_add_ps(fjx2,tx);
1777             fjy2             = _mm256_add_ps(fjy2,ty);
1778             fjz2             = _mm256_add_ps(fjz2,tz);
1779
1780             fjptrA             = f+j_coord_offsetA;
1781             fjptrB             = f+j_coord_offsetB;
1782             fjptrC             = f+j_coord_offsetC;
1783             fjptrD             = f+j_coord_offsetD;
1784             fjptrE             = f+j_coord_offsetE;
1785             fjptrF             = f+j_coord_offsetF;
1786             fjptrG             = f+j_coord_offsetG;
1787             fjptrH             = f+j_coord_offsetH;
1788
1789             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1790                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1791
1792             /* Inner loop uses 511 flops */
1793         }
1794
1795         if(jidx<j_index_end)
1796         {
1797
1798             /* Get j neighbor index, and coordinate index */
1799             jnrlistA         = jjnr[jidx];
1800             jnrlistB         = jjnr[jidx+1];
1801             jnrlistC         = jjnr[jidx+2];
1802             jnrlistD         = jjnr[jidx+3];
1803             jnrlistE         = jjnr[jidx+4];
1804             jnrlistF         = jjnr[jidx+5];
1805             jnrlistG         = jjnr[jidx+6];
1806             jnrlistH         = jjnr[jidx+7];
1807             /* Sign of each element will be negative for non-real atoms.
1808              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1809              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1810              */
1811             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1812                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1813                                             
1814             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1815             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1816             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1817             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1818             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1819             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1820             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1821             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1822             j_coord_offsetA  = DIM*jnrA;
1823             j_coord_offsetB  = DIM*jnrB;
1824             j_coord_offsetC  = DIM*jnrC;
1825             j_coord_offsetD  = DIM*jnrD;
1826             j_coord_offsetE  = DIM*jnrE;
1827             j_coord_offsetF  = DIM*jnrF;
1828             j_coord_offsetG  = DIM*jnrG;
1829             j_coord_offsetH  = DIM*jnrH;
1830
1831             /* load j atom coordinates */
1832             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1833                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1834                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1835                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1836                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1837
1838             /* Calculate displacement vector */
1839             dx00             = _mm256_sub_ps(ix0,jx0);
1840             dy00             = _mm256_sub_ps(iy0,jy0);
1841             dz00             = _mm256_sub_ps(iz0,jz0);
1842             dx01             = _mm256_sub_ps(ix0,jx1);
1843             dy01             = _mm256_sub_ps(iy0,jy1);
1844             dz01             = _mm256_sub_ps(iz0,jz1);
1845             dx02             = _mm256_sub_ps(ix0,jx2);
1846             dy02             = _mm256_sub_ps(iy0,jy2);
1847             dz02             = _mm256_sub_ps(iz0,jz2);
1848             dx10             = _mm256_sub_ps(ix1,jx0);
1849             dy10             = _mm256_sub_ps(iy1,jy0);
1850             dz10             = _mm256_sub_ps(iz1,jz0);
1851             dx11             = _mm256_sub_ps(ix1,jx1);
1852             dy11             = _mm256_sub_ps(iy1,jy1);
1853             dz11             = _mm256_sub_ps(iz1,jz1);
1854             dx12             = _mm256_sub_ps(ix1,jx2);
1855             dy12             = _mm256_sub_ps(iy1,jy2);
1856             dz12             = _mm256_sub_ps(iz1,jz2);
1857             dx20             = _mm256_sub_ps(ix2,jx0);
1858             dy20             = _mm256_sub_ps(iy2,jy0);
1859             dz20             = _mm256_sub_ps(iz2,jz0);
1860             dx21             = _mm256_sub_ps(ix2,jx1);
1861             dy21             = _mm256_sub_ps(iy2,jy1);
1862             dz21             = _mm256_sub_ps(iz2,jz1);
1863             dx22             = _mm256_sub_ps(ix2,jx2);
1864             dy22             = _mm256_sub_ps(iy2,jy2);
1865             dz22             = _mm256_sub_ps(iz2,jz2);
1866
1867             /* Calculate squared distance and things based on it */
1868             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1869             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1870             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1871             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1872             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1873             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1874             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1875             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1876             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1877
1878             rinv00           = avx256_invsqrt_f(rsq00);
1879             rinv01           = avx256_invsqrt_f(rsq01);
1880             rinv02           = avx256_invsqrt_f(rsq02);
1881             rinv10           = avx256_invsqrt_f(rsq10);
1882             rinv11           = avx256_invsqrt_f(rsq11);
1883             rinv12           = avx256_invsqrt_f(rsq12);
1884             rinv20           = avx256_invsqrt_f(rsq20);
1885             rinv21           = avx256_invsqrt_f(rsq21);
1886             rinv22           = avx256_invsqrt_f(rsq22);
1887
1888             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1889             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1890             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1891             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1892             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1893             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1894             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1895             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1896             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1897
1898             fjx0             = _mm256_setzero_ps();
1899             fjy0             = _mm256_setzero_ps();
1900             fjz0             = _mm256_setzero_ps();
1901             fjx1             = _mm256_setzero_ps();
1902             fjy1             = _mm256_setzero_ps();
1903             fjz1             = _mm256_setzero_ps();
1904             fjx2             = _mm256_setzero_ps();
1905             fjy2             = _mm256_setzero_ps();
1906             fjz2             = _mm256_setzero_ps();
1907
1908             /**************************
1909              * CALCULATE INTERACTIONS *
1910              **************************/
1911
1912             r00              = _mm256_mul_ps(rsq00,rinv00);
1913             r00              = _mm256_andnot_ps(dummy_mask,r00);
1914
1915             /* EWALD ELECTROSTATICS */
1916             
1917             /* Analytical PME correction */
1918             zeta2            = _mm256_mul_ps(beta2,rsq00);
1919             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1920             pmecorrF         = avx256_pmecorrF_f(zeta2);
1921             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1922             felec            = _mm256_mul_ps(qq00,felec);
1923             
1924             /* LENNARD-JONES DISPERSION/REPULSION */
1925
1926             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1927             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1928
1929             fscal            = _mm256_add_ps(felec,fvdw);
1930
1931             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1932
1933             /* Calculate temporary vectorial force */
1934             tx               = _mm256_mul_ps(fscal,dx00);
1935             ty               = _mm256_mul_ps(fscal,dy00);
1936             tz               = _mm256_mul_ps(fscal,dz00);
1937
1938             /* Update vectorial force */
1939             fix0             = _mm256_add_ps(fix0,tx);
1940             fiy0             = _mm256_add_ps(fiy0,ty);
1941             fiz0             = _mm256_add_ps(fiz0,tz);
1942
1943             fjx0             = _mm256_add_ps(fjx0,tx);
1944             fjy0             = _mm256_add_ps(fjy0,ty);
1945             fjz0             = _mm256_add_ps(fjz0,tz);
1946
1947             /**************************
1948              * CALCULATE INTERACTIONS *
1949              **************************/
1950
1951             r01              = _mm256_mul_ps(rsq01,rinv01);
1952             r01              = _mm256_andnot_ps(dummy_mask,r01);
1953
1954             /* EWALD ELECTROSTATICS */
1955             
1956             /* Analytical PME correction */
1957             zeta2            = _mm256_mul_ps(beta2,rsq01);
1958             rinv3            = _mm256_mul_ps(rinvsq01,rinv01);
1959             pmecorrF         = avx256_pmecorrF_f(zeta2);
1960             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1961             felec            = _mm256_mul_ps(qq01,felec);
1962             
1963             fscal            = felec;
1964
1965             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1966
1967             /* Calculate temporary vectorial force */
1968             tx               = _mm256_mul_ps(fscal,dx01);
1969             ty               = _mm256_mul_ps(fscal,dy01);
1970             tz               = _mm256_mul_ps(fscal,dz01);
1971
1972             /* Update vectorial force */
1973             fix0             = _mm256_add_ps(fix0,tx);
1974             fiy0             = _mm256_add_ps(fiy0,ty);
1975             fiz0             = _mm256_add_ps(fiz0,tz);
1976
1977             fjx1             = _mm256_add_ps(fjx1,tx);
1978             fjy1             = _mm256_add_ps(fjy1,ty);
1979             fjz1             = _mm256_add_ps(fjz1,tz);
1980
1981             /**************************
1982              * CALCULATE INTERACTIONS *
1983              **************************/
1984
1985             r02              = _mm256_mul_ps(rsq02,rinv02);
1986             r02              = _mm256_andnot_ps(dummy_mask,r02);
1987
1988             /* EWALD ELECTROSTATICS */
1989             
1990             /* Analytical PME correction */
1991             zeta2            = _mm256_mul_ps(beta2,rsq02);
1992             rinv3            = _mm256_mul_ps(rinvsq02,rinv02);
1993             pmecorrF         = avx256_pmecorrF_f(zeta2);
1994             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1995             felec            = _mm256_mul_ps(qq02,felec);
1996             
1997             fscal            = felec;
1998
1999             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2000
2001             /* Calculate temporary vectorial force */
2002             tx               = _mm256_mul_ps(fscal,dx02);
2003             ty               = _mm256_mul_ps(fscal,dy02);
2004             tz               = _mm256_mul_ps(fscal,dz02);
2005
2006             /* Update vectorial force */
2007             fix0             = _mm256_add_ps(fix0,tx);
2008             fiy0             = _mm256_add_ps(fiy0,ty);
2009             fiz0             = _mm256_add_ps(fiz0,tz);
2010
2011             fjx2             = _mm256_add_ps(fjx2,tx);
2012             fjy2             = _mm256_add_ps(fjy2,ty);
2013             fjz2             = _mm256_add_ps(fjz2,tz);
2014
2015             /**************************
2016              * CALCULATE INTERACTIONS *
2017              **************************/
2018
2019             r10              = _mm256_mul_ps(rsq10,rinv10);
2020             r10              = _mm256_andnot_ps(dummy_mask,r10);
2021
2022             /* EWALD ELECTROSTATICS */
2023             
2024             /* Analytical PME correction */
2025             zeta2            = _mm256_mul_ps(beta2,rsq10);
2026             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
2027             pmecorrF         = avx256_pmecorrF_f(zeta2);
2028             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2029             felec            = _mm256_mul_ps(qq10,felec);
2030             
2031             fscal            = felec;
2032
2033             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2034
2035             /* Calculate temporary vectorial force */
2036             tx               = _mm256_mul_ps(fscal,dx10);
2037             ty               = _mm256_mul_ps(fscal,dy10);
2038             tz               = _mm256_mul_ps(fscal,dz10);
2039
2040             /* Update vectorial force */
2041             fix1             = _mm256_add_ps(fix1,tx);
2042             fiy1             = _mm256_add_ps(fiy1,ty);
2043             fiz1             = _mm256_add_ps(fiz1,tz);
2044
2045             fjx0             = _mm256_add_ps(fjx0,tx);
2046             fjy0             = _mm256_add_ps(fjy0,ty);
2047             fjz0             = _mm256_add_ps(fjz0,tz);
2048
2049             /**************************
2050              * CALCULATE INTERACTIONS *
2051              **************************/
2052
2053             r11              = _mm256_mul_ps(rsq11,rinv11);
2054             r11              = _mm256_andnot_ps(dummy_mask,r11);
2055
2056             /* EWALD ELECTROSTATICS */
2057             
2058             /* Analytical PME correction */
2059             zeta2            = _mm256_mul_ps(beta2,rsq11);
2060             rinv3            = _mm256_mul_ps(rinvsq11,rinv11);
2061             pmecorrF         = avx256_pmecorrF_f(zeta2);
2062             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2063             felec            = _mm256_mul_ps(qq11,felec);
2064             
2065             fscal            = felec;
2066
2067             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2068
2069             /* Calculate temporary vectorial force */
2070             tx               = _mm256_mul_ps(fscal,dx11);
2071             ty               = _mm256_mul_ps(fscal,dy11);
2072             tz               = _mm256_mul_ps(fscal,dz11);
2073
2074             /* Update vectorial force */
2075             fix1             = _mm256_add_ps(fix1,tx);
2076             fiy1             = _mm256_add_ps(fiy1,ty);
2077             fiz1             = _mm256_add_ps(fiz1,tz);
2078
2079             fjx1             = _mm256_add_ps(fjx1,tx);
2080             fjy1             = _mm256_add_ps(fjy1,ty);
2081             fjz1             = _mm256_add_ps(fjz1,tz);
2082
2083             /**************************
2084              * CALCULATE INTERACTIONS *
2085              **************************/
2086
2087             r12              = _mm256_mul_ps(rsq12,rinv12);
2088             r12              = _mm256_andnot_ps(dummy_mask,r12);
2089
2090             /* EWALD ELECTROSTATICS */
2091             
2092             /* Analytical PME correction */
2093             zeta2            = _mm256_mul_ps(beta2,rsq12);
2094             rinv3            = _mm256_mul_ps(rinvsq12,rinv12);
2095             pmecorrF         = avx256_pmecorrF_f(zeta2);
2096             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2097             felec            = _mm256_mul_ps(qq12,felec);
2098             
2099             fscal            = felec;
2100
2101             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2102
2103             /* Calculate temporary vectorial force */
2104             tx               = _mm256_mul_ps(fscal,dx12);
2105             ty               = _mm256_mul_ps(fscal,dy12);
2106             tz               = _mm256_mul_ps(fscal,dz12);
2107
2108             /* Update vectorial force */
2109             fix1             = _mm256_add_ps(fix1,tx);
2110             fiy1             = _mm256_add_ps(fiy1,ty);
2111             fiz1             = _mm256_add_ps(fiz1,tz);
2112
2113             fjx2             = _mm256_add_ps(fjx2,tx);
2114             fjy2             = _mm256_add_ps(fjy2,ty);
2115             fjz2             = _mm256_add_ps(fjz2,tz);
2116
2117             /**************************
2118              * CALCULATE INTERACTIONS *
2119              **************************/
2120
2121             r20              = _mm256_mul_ps(rsq20,rinv20);
2122             r20              = _mm256_andnot_ps(dummy_mask,r20);
2123
2124             /* EWALD ELECTROSTATICS */
2125             
2126             /* Analytical PME correction */
2127             zeta2            = _mm256_mul_ps(beta2,rsq20);
2128             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
2129             pmecorrF         = avx256_pmecorrF_f(zeta2);
2130             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2131             felec            = _mm256_mul_ps(qq20,felec);
2132             
2133             fscal            = felec;
2134
2135             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2136
2137             /* Calculate temporary vectorial force */
2138             tx               = _mm256_mul_ps(fscal,dx20);
2139             ty               = _mm256_mul_ps(fscal,dy20);
2140             tz               = _mm256_mul_ps(fscal,dz20);
2141
2142             /* Update vectorial force */
2143             fix2             = _mm256_add_ps(fix2,tx);
2144             fiy2             = _mm256_add_ps(fiy2,ty);
2145             fiz2             = _mm256_add_ps(fiz2,tz);
2146
2147             fjx0             = _mm256_add_ps(fjx0,tx);
2148             fjy0             = _mm256_add_ps(fjy0,ty);
2149             fjz0             = _mm256_add_ps(fjz0,tz);
2150
2151             /**************************
2152              * CALCULATE INTERACTIONS *
2153              **************************/
2154
2155             r21              = _mm256_mul_ps(rsq21,rinv21);
2156             r21              = _mm256_andnot_ps(dummy_mask,r21);
2157
2158             /* EWALD ELECTROSTATICS */
2159             
2160             /* Analytical PME correction */
2161             zeta2            = _mm256_mul_ps(beta2,rsq21);
2162             rinv3            = _mm256_mul_ps(rinvsq21,rinv21);
2163             pmecorrF         = avx256_pmecorrF_f(zeta2);
2164             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2165             felec            = _mm256_mul_ps(qq21,felec);
2166             
2167             fscal            = felec;
2168
2169             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2170
2171             /* Calculate temporary vectorial force */
2172             tx               = _mm256_mul_ps(fscal,dx21);
2173             ty               = _mm256_mul_ps(fscal,dy21);
2174             tz               = _mm256_mul_ps(fscal,dz21);
2175
2176             /* Update vectorial force */
2177             fix2             = _mm256_add_ps(fix2,tx);
2178             fiy2             = _mm256_add_ps(fiy2,ty);
2179             fiz2             = _mm256_add_ps(fiz2,tz);
2180
2181             fjx1             = _mm256_add_ps(fjx1,tx);
2182             fjy1             = _mm256_add_ps(fjy1,ty);
2183             fjz1             = _mm256_add_ps(fjz1,tz);
2184
2185             /**************************
2186              * CALCULATE INTERACTIONS *
2187              **************************/
2188
2189             r22              = _mm256_mul_ps(rsq22,rinv22);
2190             r22              = _mm256_andnot_ps(dummy_mask,r22);
2191
2192             /* EWALD ELECTROSTATICS */
2193             
2194             /* Analytical PME correction */
2195             zeta2            = _mm256_mul_ps(beta2,rsq22);
2196             rinv3            = _mm256_mul_ps(rinvsq22,rinv22);
2197             pmecorrF         = avx256_pmecorrF_f(zeta2);
2198             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
2199             felec            = _mm256_mul_ps(qq22,felec);
2200             
2201             fscal            = felec;
2202
2203             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
2204
2205             /* Calculate temporary vectorial force */
2206             tx               = _mm256_mul_ps(fscal,dx22);
2207             ty               = _mm256_mul_ps(fscal,dy22);
2208             tz               = _mm256_mul_ps(fscal,dz22);
2209
2210             /* Update vectorial force */
2211             fix2             = _mm256_add_ps(fix2,tx);
2212             fiy2             = _mm256_add_ps(fiy2,ty);
2213             fiz2             = _mm256_add_ps(fiz2,tz);
2214
2215             fjx2             = _mm256_add_ps(fjx2,tx);
2216             fjy2             = _mm256_add_ps(fjy2,ty);
2217             fjz2             = _mm256_add_ps(fjz2,tz);
2218
2219             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2220             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2221             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2222             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2223             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2224             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2225             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2226             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2227
2228             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2229                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2230
2231             /* Inner loop uses 520 flops */
2232         }
2233
2234         /* End of innermost loop */
2235
2236         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2237                                                  f+i_coord_offset,fshift+i_shift_offset);
2238
2239         /* Increment number of inner iterations */
2240         inneriter                  += j_index_end - j_index_start;
2241
2242         /* Outer loop uses 18 flops */
2243     }
2244
2245     /* Increment number of outer iterations */
2246     outeriter        += nri;
2247
2248     /* Update outer/inner flops */
2249
2250     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*520);
2251 }