Merge release-5-0 into master
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecCoul_VdwLJ_GeomW3W3_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_single
54  * Electrostatics interaction: Coulomb
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water3-Water3
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72      * jnr indices corresponding to data put in the eight positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrE,jnrF,jnrG,jnrH;
78     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
83     real             rcutoff_scalar;
84     real             *shiftvec,*fshift,*x,*f;
85     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
86     real             scratch[4*DIM];
87     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88     real *           vdwioffsetptr0;
89     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90     real *           vdwioffsetptr1;
91     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92     real *           vdwioffsetptr2;
93     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
102     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
103     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
104     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
105     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
106     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
107     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
108     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
109     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
110     real             *charge;
111     int              nvdwtype;
112     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113     int              *vdwtype;
114     real             *vdwparam;
115     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
116     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
117     __m256           dummy_mask,cutoff_mask;
118     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
119     __m256           one     = _mm256_set1_ps(1.0);
120     __m256           two     = _mm256_set1_ps(2.0);
121     x                = xx[0];
122     f                = ff[0];
123
124     nri              = nlist->nri;
125     iinr             = nlist->iinr;
126     jindex           = nlist->jindex;
127     jjnr             = nlist->jjnr;
128     shiftidx         = nlist->shift;
129     gid              = nlist->gid;
130     shiftvec         = fr->shift_vec[0];
131     fshift           = fr->fshift[0];
132     facel            = _mm256_set1_ps(fr->epsfac);
133     charge           = mdatoms->chargeA;
134     nvdwtype         = fr->ntype;
135     vdwparam         = fr->nbfp;
136     vdwtype          = mdatoms->typeA;
137
138     /* Setup water-specific parameters */
139     inr              = nlist->iinr[0];
140     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
141     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
142     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
143     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
144
145     jq0              = _mm256_set1_ps(charge[inr+0]);
146     jq1              = _mm256_set1_ps(charge[inr+1]);
147     jq2              = _mm256_set1_ps(charge[inr+2]);
148     vdwjidx0A        = 2*vdwtype[inr+0];
149     qq00             = _mm256_mul_ps(iq0,jq0);
150     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
151     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
152     qq01             = _mm256_mul_ps(iq0,jq1);
153     qq02             = _mm256_mul_ps(iq0,jq2);
154     qq10             = _mm256_mul_ps(iq1,jq0);
155     qq11             = _mm256_mul_ps(iq1,jq1);
156     qq12             = _mm256_mul_ps(iq1,jq2);
157     qq20             = _mm256_mul_ps(iq2,jq0);
158     qq21             = _mm256_mul_ps(iq2,jq1);
159     qq22             = _mm256_mul_ps(iq2,jq2);
160
161     /* Avoid stupid compiler warnings */
162     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
163     j_coord_offsetA = 0;
164     j_coord_offsetB = 0;
165     j_coord_offsetC = 0;
166     j_coord_offsetD = 0;
167     j_coord_offsetE = 0;
168     j_coord_offsetF = 0;
169     j_coord_offsetG = 0;
170     j_coord_offsetH = 0;
171
172     outeriter        = 0;
173     inneriter        = 0;
174
175     for(iidx=0;iidx<4*DIM;iidx++)
176     {
177         scratch[iidx] = 0.0;
178     }
179
180     /* Start outer loop over neighborlists */
181     for(iidx=0; iidx<nri; iidx++)
182     {
183         /* Load shift vector for this list */
184         i_shift_offset   = DIM*shiftidx[iidx];
185
186         /* Load limits for loop over neighbors */
187         j_index_start    = jindex[iidx];
188         j_index_end      = jindex[iidx+1];
189
190         /* Get outer coordinate index */
191         inr              = iinr[iidx];
192         i_coord_offset   = DIM*inr;
193
194         /* Load i particle coords and add shift vector */
195         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
196                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
197
198         fix0             = _mm256_setzero_ps();
199         fiy0             = _mm256_setzero_ps();
200         fiz0             = _mm256_setzero_ps();
201         fix1             = _mm256_setzero_ps();
202         fiy1             = _mm256_setzero_ps();
203         fiz1             = _mm256_setzero_ps();
204         fix2             = _mm256_setzero_ps();
205         fiy2             = _mm256_setzero_ps();
206         fiz2             = _mm256_setzero_ps();
207
208         /* Reset potential sums */
209         velecsum         = _mm256_setzero_ps();
210         vvdwsum          = _mm256_setzero_ps();
211
212         /* Start inner kernel loop */
213         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
214         {
215
216             /* Get j neighbor index, and coordinate index */
217             jnrA             = jjnr[jidx];
218             jnrB             = jjnr[jidx+1];
219             jnrC             = jjnr[jidx+2];
220             jnrD             = jjnr[jidx+3];
221             jnrE             = jjnr[jidx+4];
222             jnrF             = jjnr[jidx+5];
223             jnrG             = jjnr[jidx+6];
224             jnrH             = jjnr[jidx+7];
225             j_coord_offsetA  = DIM*jnrA;
226             j_coord_offsetB  = DIM*jnrB;
227             j_coord_offsetC  = DIM*jnrC;
228             j_coord_offsetD  = DIM*jnrD;
229             j_coord_offsetE  = DIM*jnrE;
230             j_coord_offsetF  = DIM*jnrF;
231             j_coord_offsetG  = DIM*jnrG;
232             j_coord_offsetH  = DIM*jnrH;
233
234             /* load j atom coordinates */
235             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236                                                  x+j_coord_offsetC,x+j_coord_offsetD,
237                                                  x+j_coord_offsetE,x+j_coord_offsetF,
238                                                  x+j_coord_offsetG,x+j_coord_offsetH,
239                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
240
241             /* Calculate displacement vector */
242             dx00             = _mm256_sub_ps(ix0,jx0);
243             dy00             = _mm256_sub_ps(iy0,jy0);
244             dz00             = _mm256_sub_ps(iz0,jz0);
245             dx01             = _mm256_sub_ps(ix0,jx1);
246             dy01             = _mm256_sub_ps(iy0,jy1);
247             dz01             = _mm256_sub_ps(iz0,jz1);
248             dx02             = _mm256_sub_ps(ix0,jx2);
249             dy02             = _mm256_sub_ps(iy0,jy2);
250             dz02             = _mm256_sub_ps(iz0,jz2);
251             dx10             = _mm256_sub_ps(ix1,jx0);
252             dy10             = _mm256_sub_ps(iy1,jy0);
253             dz10             = _mm256_sub_ps(iz1,jz0);
254             dx11             = _mm256_sub_ps(ix1,jx1);
255             dy11             = _mm256_sub_ps(iy1,jy1);
256             dz11             = _mm256_sub_ps(iz1,jz1);
257             dx12             = _mm256_sub_ps(ix1,jx2);
258             dy12             = _mm256_sub_ps(iy1,jy2);
259             dz12             = _mm256_sub_ps(iz1,jz2);
260             dx20             = _mm256_sub_ps(ix2,jx0);
261             dy20             = _mm256_sub_ps(iy2,jy0);
262             dz20             = _mm256_sub_ps(iz2,jz0);
263             dx21             = _mm256_sub_ps(ix2,jx1);
264             dy21             = _mm256_sub_ps(iy2,jy1);
265             dz21             = _mm256_sub_ps(iz2,jz1);
266             dx22             = _mm256_sub_ps(ix2,jx2);
267             dy22             = _mm256_sub_ps(iy2,jy2);
268             dz22             = _mm256_sub_ps(iz2,jz2);
269
270             /* Calculate squared distance and things based on it */
271             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
272             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
273             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
274             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
275             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
276             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
277             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
278             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
279             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
280
281             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
282             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
283             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
284             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
285             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
286             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
287             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
288             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
289             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
290
291             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
292             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
293             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
294             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
295             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
296             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
297             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
298             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
299             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
300
301             fjx0             = _mm256_setzero_ps();
302             fjy0             = _mm256_setzero_ps();
303             fjz0             = _mm256_setzero_ps();
304             fjx1             = _mm256_setzero_ps();
305             fjy1             = _mm256_setzero_ps();
306             fjz1             = _mm256_setzero_ps();
307             fjx2             = _mm256_setzero_ps();
308             fjy2             = _mm256_setzero_ps();
309             fjz2             = _mm256_setzero_ps();
310
311             /**************************
312              * CALCULATE INTERACTIONS *
313              **************************/
314
315             /* COULOMB ELECTROSTATICS */
316             velec            = _mm256_mul_ps(qq00,rinv00);
317             felec            = _mm256_mul_ps(velec,rinvsq00);
318
319             /* LENNARD-JONES DISPERSION/REPULSION */
320
321             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
322             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
323             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
324             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
325             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
326
327             /* Update potential sum for this i atom from the interaction with this j atom. */
328             velecsum         = _mm256_add_ps(velecsum,velec);
329             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
330
331             fscal            = _mm256_add_ps(felec,fvdw);
332
333             /* Calculate temporary vectorial force */
334             tx               = _mm256_mul_ps(fscal,dx00);
335             ty               = _mm256_mul_ps(fscal,dy00);
336             tz               = _mm256_mul_ps(fscal,dz00);
337
338             /* Update vectorial force */
339             fix0             = _mm256_add_ps(fix0,tx);
340             fiy0             = _mm256_add_ps(fiy0,ty);
341             fiz0             = _mm256_add_ps(fiz0,tz);
342
343             fjx0             = _mm256_add_ps(fjx0,tx);
344             fjy0             = _mm256_add_ps(fjy0,ty);
345             fjz0             = _mm256_add_ps(fjz0,tz);
346
347             /**************************
348              * CALCULATE INTERACTIONS *
349              **************************/
350
351             /* COULOMB ELECTROSTATICS */
352             velec            = _mm256_mul_ps(qq01,rinv01);
353             felec            = _mm256_mul_ps(velec,rinvsq01);
354
355             /* Update potential sum for this i atom from the interaction with this j atom. */
356             velecsum         = _mm256_add_ps(velecsum,velec);
357
358             fscal            = felec;
359
360             /* Calculate temporary vectorial force */
361             tx               = _mm256_mul_ps(fscal,dx01);
362             ty               = _mm256_mul_ps(fscal,dy01);
363             tz               = _mm256_mul_ps(fscal,dz01);
364
365             /* Update vectorial force */
366             fix0             = _mm256_add_ps(fix0,tx);
367             fiy0             = _mm256_add_ps(fiy0,ty);
368             fiz0             = _mm256_add_ps(fiz0,tz);
369
370             fjx1             = _mm256_add_ps(fjx1,tx);
371             fjy1             = _mm256_add_ps(fjy1,ty);
372             fjz1             = _mm256_add_ps(fjz1,tz);
373
374             /**************************
375              * CALCULATE INTERACTIONS *
376              **************************/
377
378             /* COULOMB ELECTROSTATICS */
379             velec            = _mm256_mul_ps(qq02,rinv02);
380             felec            = _mm256_mul_ps(velec,rinvsq02);
381
382             /* Update potential sum for this i atom from the interaction with this j atom. */
383             velecsum         = _mm256_add_ps(velecsum,velec);
384
385             fscal            = felec;
386
387             /* Calculate temporary vectorial force */
388             tx               = _mm256_mul_ps(fscal,dx02);
389             ty               = _mm256_mul_ps(fscal,dy02);
390             tz               = _mm256_mul_ps(fscal,dz02);
391
392             /* Update vectorial force */
393             fix0             = _mm256_add_ps(fix0,tx);
394             fiy0             = _mm256_add_ps(fiy0,ty);
395             fiz0             = _mm256_add_ps(fiz0,tz);
396
397             fjx2             = _mm256_add_ps(fjx2,tx);
398             fjy2             = _mm256_add_ps(fjy2,ty);
399             fjz2             = _mm256_add_ps(fjz2,tz);
400
401             /**************************
402              * CALCULATE INTERACTIONS *
403              **************************/
404
405             /* COULOMB ELECTROSTATICS */
406             velec            = _mm256_mul_ps(qq10,rinv10);
407             felec            = _mm256_mul_ps(velec,rinvsq10);
408
409             /* Update potential sum for this i atom from the interaction with this j atom. */
410             velecsum         = _mm256_add_ps(velecsum,velec);
411
412             fscal            = felec;
413
414             /* Calculate temporary vectorial force */
415             tx               = _mm256_mul_ps(fscal,dx10);
416             ty               = _mm256_mul_ps(fscal,dy10);
417             tz               = _mm256_mul_ps(fscal,dz10);
418
419             /* Update vectorial force */
420             fix1             = _mm256_add_ps(fix1,tx);
421             fiy1             = _mm256_add_ps(fiy1,ty);
422             fiz1             = _mm256_add_ps(fiz1,tz);
423
424             fjx0             = _mm256_add_ps(fjx0,tx);
425             fjy0             = _mm256_add_ps(fjy0,ty);
426             fjz0             = _mm256_add_ps(fjz0,tz);
427
428             /**************************
429              * CALCULATE INTERACTIONS *
430              **************************/
431
432             /* COULOMB ELECTROSTATICS */
433             velec            = _mm256_mul_ps(qq11,rinv11);
434             felec            = _mm256_mul_ps(velec,rinvsq11);
435
436             /* Update potential sum for this i atom from the interaction with this j atom. */
437             velecsum         = _mm256_add_ps(velecsum,velec);
438
439             fscal            = felec;
440
441             /* Calculate temporary vectorial force */
442             tx               = _mm256_mul_ps(fscal,dx11);
443             ty               = _mm256_mul_ps(fscal,dy11);
444             tz               = _mm256_mul_ps(fscal,dz11);
445
446             /* Update vectorial force */
447             fix1             = _mm256_add_ps(fix1,tx);
448             fiy1             = _mm256_add_ps(fiy1,ty);
449             fiz1             = _mm256_add_ps(fiz1,tz);
450
451             fjx1             = _mm256_add_ps(fjx1,tx);
452             fjy1             = _mm256_add_ps(fjy1,ty);
453             fjz1             = _mm256_add_ps(fjz1,tz);
454
455             /**************************
456              * CALCULATE INTERACTIONS *
457              **************************/
458
459             /* COULOMB ELECTROSTATICS */
460             velec            = _mm256_mul_ps(qq12,rinv12);
461             felec            = _mm256_mul_ps(velec,rinvsq12);
462
463             /* Update potential sum for this i atom from the interaction with this j atom. */
464             velecsum         = _mm256_add_ps(velecsum,velec);
465
466             fscal            = felec;
467
468             /* Calculate temporary vectorial force */
469             tx               = _mm256_mul_ps(fscal,dx12);
470             ty               = _mm256_mul_ps(fscal,dy12);
471             tz               = _mm256_mul_ps(fscal,dz12);
472
473             /* Update vectorial force */
474             fix1             = _mm256_add_ps(fix1,tx);
475             fiy1             = _mm256_add_ps(fiy1,ty);
476             fiz1             = _mm256_add_ps(fiz1,tz);
477
478             fjx2             = _mm256_add_ps(fjx2,tx);
479             fjy2             = _mm256_add_ps(fjy2,ty);
480             fjz2             = _mm256_add_ps(fjz2,tz);
481
482             /**************************
483              * CALCULATE INTERACTIONS *
484              **************************/
485
486             /* COULOMB ELECTROSTATICS */
487             velec            = _mm256_mul_ps(qq20,rinv20);
488             felec            = _mm256_mul_ps(velec,rinvsq20);
489
490             /* Update potential sum for this i atom from the interaction with this j atom. */
491             velecsum         = _mm256_add_ps(velecsum,velec);
492
493             fscal            = felec;
494
495             /* Calculate temporary vectorial force */
496             tx               = _mm256_mul_ps(fscal,dx20);
497             ty               = _mm256_mul_ps(fscal,dy20);
498             tz               = _mm256_mul_ps(fscal,dz20);
499
500             /* Update vectorial force */
501             fix2             = _mm256_add_ps(fix2,tx);
502             fiy2             = _mm256_add_ps(fiy2,ty);
503             fiz2             = _mm256_add_ps(fiz2,tz);
504
505             fjx0             = _mm256_add_ps(fjx0,tx);
506             fjy0             = _mm256_add_ps(fjy0,ty);
507             fjz0             = _mm256_add_ps(fjz0,tz);
508
509             /**************************
510              * CALCULATE INTERACTIONS *
511              **************************/
512
513             /* COULOMB ELECTROSTATICS */
514             velec            = _mm256_mul_ps(qq21,rinv21);
515             felec            = _mm256_mul_ps(velec,rinvsq21);
516
517             /* Update potential sum for this i atom from the interaction with this j atom. */
518             velecsum         = _mm256_add_ps(velecsum,velec);
519
520             fscal            = felec;
521
522             /* Calculate temporary vectorial force */
523             tx               = _mm256_mul_ps(fscal,dx21);
524             ty               = _mm256_mul_ps(fscal,dy21);
525             tz               = _mm256_mul_ps(fscal,dz21);
526
527             /* Update vectorial force */
528             fix2             = _mm256_add_ps(fix2,tx);
529             fiy2             = _mm256_add_ps(fiy2,ty);
530             fiz2             = _mm256_add_ps(fiz2,tz);
531
532             fjx1             = _mm256_add_ps(fjx1,tx);
533             fjy1             = _mm256_add_ps(fjy1,ty);
534             fjz1             = _mm256_add_ps(fjz1,tz);
535
536             /**************************
537              * CALCULATE INTERACTIONS *
538              **************************/
539
540             /* COULOMB ELECTROSTATICS */
541             velec            = _mm256_mul_ps(qq22,rinv22);
542             felec            = _mm256_mul_ps(velec,rinvsq22);
543
544             /* Update potential sum for this i atom from the interaction with this j atom. */
545             velecsum         = _mm256_add_ps(velecsum,velec);
546
547             fscal            = felec;
548
549             /* Calculate temporary vectorial force */
550             tx               = _mm256_mul_ps(fscal,dx22);
551             ty               = _mm256_mul_ps(fscal,dy22);
552             tz               = _mm256_mul_ps(fscal,dz22);
553
554             /* Update vectorial force */
555             fix2             = _mm256_add_ps(fix2,tx);
556             fiy2             = _mm256_add_ps(fiy2,ty);
557             fiz2             = _mm256_add_ps(fiz2,tz);
558
559             fjx2             = _mm256_add_ps(fjx2,tx);
560             fjy2             = _mm256_add_ps(fjy2,ty);
561             fjz2             = _mm256_add_ps(fjz2,tz);
562
563             fjptrA             = f+j_coord_offsetA;
564             fjptrB             = f+j_coord_offsetB;
565             fjptrC             = f+j_coord_offsetC;
566             fjptrD             = f+j_coord_offsetD;
567             fjptrE             = f+j_coord_offsetE;
568             fjptrF             = f+j_coord_offsetF;
569             fjptrG             = f+j_coord_offsetG;
570             fjptrH             = f+j_coord_offsetH;
571
572             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
573                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
574
575             /* Inner loop uses 255 flops */
576         }
577
578         if(jidx<j_index_end)
579         {
580
581             /* Get j neighbor index, and coordinate index */
582             jnrlistA         = jjnr[jidx];
583             jnrlistB         = jjnr[jidx+1];
584             jnrlistC         = jjnr[jidx+2];
585             jnrlistD         = jjnr[jidx+3];
586             jnrlistE         = jjnr[jidx+4];
587             jnrlistF         = jjnr[jidx+5];
588             jnrlistG         = jjnr[jidx+6];
589             jnrlistH         = jjnr[jidx+7];
590             /* Sign of each element will be negative for non-real atoms.
591              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
592              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
593              */
594             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
595                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
596                                             
597             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
598             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
599             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
600             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
601             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
602             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
603             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
604             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
605             j_coord_offsetA  = DIM*jnrA;
606             j_coord_offsetB  = DIM*jnrB;
607             j_coord_offsetC  = DIM*jnrC;
608             j_coord_offsetD  = DIM*jnrD;
609             j_coord_offsetE  = DIM*jnrE;
610             j_coord_offsetF  = DIM*jnrF;
611             j_coord_offsetG  = DIM*jnrG;
612             j_coord_offsetH  = DIM*jnrH;
613
614             /* load j atom coordinates */
615             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
616                                                  x+j_coord_offsetC,x+j_coord_offsetD,
617                                                  x+j_coord_offsetE,x+j_coord_offsetF,
618                                                  x+j_coord_offsetG,x+j_coord_offsetH,
619                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
620
621             /* Calculate displacement vector */
622             dx00             = _mm256_sub_ps(ix0,jx0);
623             dy00             = _mm256_sub_ps(iy0,jy0);
624             dz00             = _mm256_sub_ps(iz0,jz0);
625             dx01             = _mm256_sub_ps(ix0,jx1);
626             dy01             = _mm256_sub_ps(iy0,jy1);
627             dz01             = _mm256_sub_ps(iz0,jz1);
628             dx02             = _mm256_sub_ps(ix0,jx2);
629             dy02             = _mm256_sub_ps(iy0,jy2);
630             dz02             = _mm256_sub_ps(iz0,jz2);
631             dx10             = _mm256_sub_ps(ix1,jx0);
632             dy10             = _mm256_sub_ps(iy1,jy0);
633             dz10             = _mm256_sub_ps(iz1,jz0);
634             dx11             = _mm256_sub_ps(ix1,jx1);
635             dy11             = _mm256_sub_ps(iy1,jy1);
636             dz11             = _mm256_sub_ps(iz1,jz1);
637             dx12             = _mm256_sub_ps(ix1,jx2);
638             dy12             = _mm256_sub_ps(iy1,jy2);
639             dz12             = _mm256_sub_ps(iz1,jz2);
640             dx20             = _mm256_sub_ps(ix2,jx0);
641             dy20             = _mm256_sub_ps(iy2,jy0);
642             dz20             = _mm256_sub_ps(iz2,jz0);
643             dx21             = _mm256_sub_ps(ix2,jx1);
644             dy21             = _mm256_sub_ps(iy2,jy1);
645             dz21             = _mm256_sub_ps(iz2,jz1);
646             dx22             = _mm256_sub_ps(ix2,jx2);
647             dy22             = _mm256_sub_ps(iy2,jy2);
648             dz22             = _mm256_sub_ps(iz2,jz2);
649
650             /* Calculate squared distance and things based on it */
651             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
652             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
653             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
654             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
655             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
656             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
657             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
658             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
659             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
660
661             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
662             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
663             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
664             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
665             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
666             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
667             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
668             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
669             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
670
671             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
672             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
673             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
674             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
675             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
676             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
677             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
678             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
679             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
680
681             fjx0             = _mm256_setzero_ps();
682             fjy0             = _mm256_setzero_ps();
683             fjz0             = _mm256_setzero_ps();
684             fjx1             = _mm256_setzero_ps();
685             fjy1             = _mm256_setzero_ps();
686             fjz1             = _mm256_setzero_ps();
687             fjx2             = _mm256_setzero_ps();
688             fjy2             = _mm256_setzero_ps();
689             fjz2             = _mm256_setzero_ps();
690
691             /**************************
692              * CALCULATE INTERACTIONS *
693              **************************/
694
695             /* COULOMB ELECTROSTATICS */
696             velec            = _mm256_mul_ps(qq00,rinv00);
697             felec            = _mm256_mul_ps(velec,rinvsq00);
698
699             /* LENNARD-JONES DISPERSION/REPULSION */
700
701             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
702             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
703             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
704             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
705             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
706
707             /* Update potential sum for this i atom from the interaction with this j atom. */
708             velec            = _mm256_andnot_ps(dummy_mask,velec);
709             velecsum         = _mm256_add_ps(velecsum,velec);
710             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
711             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
712
713             fscal            = _mm256_add_ps(felec,fvdw);
714
715             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
716
717             /* Calculate temporary vectorial force */
718             tx               = _mm256_mul_ps(fscal,dx00);
719             ty               = _mm256_mul_ps(fscal,dy00);
720             tz               = _mm256_mul_ps(fscal,dz00);
721
722             /* Update vectorial force */
723             fix0             = _mm256_add_ps(fix0,tx);
724             fiy0             = _mm256_add_ps(fiy0,ty);
725             fiz0             = _mm256_add_ps(fiz0,tz);
726
727             fjx0             = _mm256_add_ps(fjx0,tx);
728             fjy0             = _mm256_add_ps(fjy0,ty);
729             fjz0             = _mm256_add_ps(fjz0,tz);
730
731             /**************************
732              * CALCULATE INTERACTIONS *
733              **************************/
734
735             /* COULOMB ELECTROSTATICS */
736             velec            = _mm256_mul_ps(qq01,rinv01);
737             felec            = _mm256_mul_ps(velec,rinvsq01);
738
739             /* Update potential sum for this i atom from the interaction with this j atom. */
740             velec            = _mm256_andnot_ps(dummy_mask,velec);
741             velecsum         = _mm256_add_ps(velecsum,velec);
742
743             fscal            = felec;
744
745             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
746
747             /* Calculate temporary vectorial force */
748             tx               = _mm256_mul_ps(fscal,dx01);
749             ty               = _mm256_mul_ps(fscal,dy01);
750             tz               = _mm256_mul_ps(fscal,dz01);
751
752             /* Update vectorial force */
753             fix0             = _mm256_add_ps(fix0,tx);
754             fiy0             = _mm256_add_ps(fiy0,ty);
755             fiz0             = _mm256_add_ps(fiz0,tz);
756
757             fjx1             = _mm256_add_ps(fjx1,tx);
758             fjy1             = _mm256_add_ps(fjy1,ty);
759             fjz1             = _mm256_add_ps(fjz1,tz);
760
761             /**************************
762              * CALCULATE INTERACTIONS *
763              **************************/
764
765             /* COULOMB ELECTROSTATICS */
766             velec            = _mm256_mul_ps(qq02,rinv02);
767             felec            = _mm256_mul_ps(velec,rinvsq02);
768
769             /* Update potential sum for this i atom from the interaction with this j atom. */
770             velec            = _mm256_andnot_ps(dummy_mask,velec);
771             velecsum         = _mm256_add_ps(velecsum,velec);
772
773             fscal            = felec;
774
775             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
776
777             /* Calculate temporary vectorial force */
778             tx               = _mm256_mul_ps(fscal,dx02);
779             ty               = _mm256_mul_ps(fscal,dy02);
780             tz               = _mm256_mul_ps(fscal,dz02);
781
782             /* Update vectorial force */
783             fix0             = _mm256_add_ps(fix0,tx);
784             fiy0             = _mm256_add_ps(fiy0,ty);
785             fiz0             = _mm256_add_ps(fiz0,tz);
786
787             fjx2             = _mm256_add_ps(fjx2,tx);
788             fjy2             = _mm256_add_ps(fjy2,ty);
789             fjz2             = _mm256_add_ps(fjz2,tz);
790
791             /**************************
792              * CALCULATE INTERACTIONS *
793              **************************/
794
795             /* COULOMB ELECTROSTATICS */
796             velec            = _mm256_mul_ps(qq10,rinv10);
797             felec            = _mm256_mul_ps(velec,rinvsq10);
798
799             /* Update potential sum for this i atom from the interaction with this j atom. */
800             velec            = _mm256_andnot_ps(dummy_mask,velec);
801             velecsum         = _mm256_add_ps(velecsum,velec);
802
803             fscal            = felec;
804
805             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
806
807             /* Calculate temporary vectorial force */
808             tx               = _mm256_mul_ps(fscal,dx10);
809             ty               = _mm256_mul_ps(fscal,dy10);
810             tz               = _mm256_mul_ps(fscal,dz10);
811
812             /* Update vectorial force */
813             fix1             = _mm256_add_ps(fix1,tx);
814             fiy1             = _mm256_add_ps(fiy1,ty);
815             fiz1             = _mm256_add_ps(fiz1,tz);
816
817             fjx0             = _mm256_add_ps(fjx0,tx);
818             fjy0             = _mm256_add_ps(fjy0,ty);
819             fjz0             = _mm256_add_ps(fjz0,tz);
820
821             /**************************
822              * CALCULATE INTERACTIONS *
823              **************************/
824
825             /* COULOMB ELECTROSTATICS */
826             velec            = _mm256_mul_ps(qq11,rinv11);
827             felec            = _mm256_mul_ps(velec,rinvsq11);
828
829             /* Update potential sum for this i atom from the interaction with this j atom. */
830             velec            = _mm256_andnot_ps(dummy_mask,velec);
831             velecsum         = _mm256_add_ps(velecsum,velec);
832
833             fscal            = felec;
834
835             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
836
837             /* Calculate temporary vectorial force */
838             tx               = _mm256_mul_ps(fscal,dx11);
839             ty               = _mm256_mul_ps(fscal,dy11);
840             tz               = _mm256_mul_ps(fscal,dz11);
841
842             /* Update vectorial force */
843             fix1             = _mm256_add_ps(fix1,tx);
844             fiy1             = _mm256_add_ps(fiy1,ty);
845             fiz1             = _mm256_add_ps(fiz1,tz);
846
847             fjx1             = _mm256_add_ps(fjx1,tx);
848             fjy1             = _mm256_add_ps(fjy1,ty);
849             fjz1             = _mm256_add_ps(fjz1,tz);
850
851             /**************************
852              * CALCULATE INTERACTIONS *
853              **************************/
854
855             /* COULOMB ELECTROSTATICS */
856             velec            = _mm256_mul_ps(qq12,rinv12);
857             felec            = _mm256_mul_ps(velec,rinvsq12);
858
859             /* Update potential sum for this i atom from the interaction with this j atom. */
860             velec            = _mm256_andnot_ps(dummy_mask,velec);
861             velecsum         = _mm256_add_ps(velecsum,velec);
862
863             fscal            = felec;
864
865             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
866
867             /* Calculate temporary vectorial force */
868             tx               = _mm256_mul_ps(fscal,dx12);
869             ty               = _mm256_mul_ps(fscal,dy12);
870             tz               = _mm256_mul_ps(fscal,dz12);
871
872             /* Update vectorial force */
873             fix1             = _mm256_add_ps(fix1,tx);
874             fiy1             = _mm256_add_ps(fiy1,ty);
875             fiz1             = _mm256_add_ps(fiz1,tz);
876
877             fjx2             = _mm256_add_ps(fjx2,tx);
878             fjy2             = _mm256_add_ps(fjy2,ty);
879             fjz2             = _mm256_add_ps(fjz2,tz);
880
881             /**************************
882              * CALCULATE INTERACTIONS *
883              **************************/
884
885             /* COULOMB ELECTROSTATICS */
886             velec            = _mm256_mul_ps(qq20,rinv20);
887             felec            = _mm256_mul_ps(velec,rinvsq20);
888
889             /* Update potential sum for this i atom from the interaction with this j atom. */
890             velec            = _mm256_andnot_ps(dummy_mask,velec);
891             velecsum         = _mm256_add_ps(velecsum,velec);
892
893             fscal            = felec;
894
895             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
896
897             /* Calculate temporary vectorial force */
898             tx               = _mm256_mul_ps(fscal,dx20);
899             ty               = _mm256_mul_ps(fscal,dy20);
900             tz               = _mm256_mul_ps(fscal,dz20);
901
902             /* Update vectorial force */
903             fix2             = _mm256_add_ps(fix2,tx);
904             fiy2             = _mm256_add_ps(fiy2,ty);
905             fiz2             = _mm256_add_ps(fiz2,tz);
906
907             fjx0             = _mm256_add_ps(fjx0,tx);
908             fjy0             = _mm256_add_ps(fjy0,ty);
909             fjz0             = _mm256_add_ps(fjz0,tz);
910
911             /**************************
912              * CALCULATE INTERACTIONS *
913              **************************/
914
915             /* COULOMB ELECTROSTATICS */
916             velec            = _mm256_mul_ps(qq21,rinv21);
917             felec            = _mm256_mul_ps(velec,rinvsq21);
918
919             /* Update potential sum for this i atom from the interaction with this j atom. */
920             velec            = _mm256_andnot_ps(dummy_mask,velec);
921             velecsum         = _mm256_add_ps(velecsum,velec);
922
923             fscal            = felec;
924
925             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
926
927             /* Calculate temporary vectorial force */
928             tx               = _mm256_mul_ps(fscal,dx21);
929             ty               = _mm256_mul_ps(fscal,dy21);
930             tz               = _mm256_mul_ps(fscal,dz21);
931
932             /* Update vectorial force */
933             fix2             = _mm256_add_ps(fix2,tx);
934             fiy2             = _mm256_add_ps(fiy2,ty);
935             fiz2             = _mm256_add_ps(fiz2,tz);
936
937             fjx1             = _mm256_add_ps(fjx1,tx);
938             fjy1             = _mm256_add_ps(fjy1,ty);
939             fjz1             = _mm256_add_ps(fjz1,tz);
940
941             /**************************
942              * CALCULATE INTERACTIONS *
943              **************************/
944
945             /* COULOMB ELECTROSTATICS */
946             velec            = _mm256_mul_ps(qq22,rinv22);
947             felec            = _mm256_mul_ps(velec,rinvsq22);
948
949             /* Update potential sum for this i atom from the interaction with this j atom. */
950             velec            = _mm256_andnot_ps(dummy_mask,velec);
951             velecsum         = _mm256_add_ps(velecsum,velec);
952
953             fscal            = felec;
954
955             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
956
957             /* Calculate temporary vectorial force */
958             tx               = _mm256_mul_ps(fscal,dx22);
959             ty               = _mm256_mul_ps(fscal,dy22);
960             tz               = _mm256_mul_ps(fscal,dz22);
961
962             /* Update vectorial force */
963             fix2             = _mm256_add_ps(fix2,tx);
964             fiy2             = _mm256_add_ps(fiy2,ty);
965             fiz2             = _mm256_add_ps(fiz2,tz);
966
967             fjx2             = _mm256_add_ps(fjx2,tx);
968             fjy2             = _mm256_add_ps(fjy2,ty);
969             fjz2             = _mm256_add_ps(fjz2,tz);
970
971             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
972             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
973             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
974             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
975             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
976             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
977             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
978             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
979
980             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
981                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
982
983             /* Inner loop uses 255 flops */
984         }
985
986         /* End of innermost loop */
987
988         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
989                                                  f+i_coord_offset,fshift+i_shift_offset);
990
991         ggid                        = gid[iidx];
992         /* Update potential energies */
993         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
994         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
995
996         /* Increment number of inner iterations */
997         inneriter                  += j_index_end - j_index_start;
998
999         /* Outer loop uses 20 flops */
1000     }
1001
1002     /* Increment number of outer iterations */
1003     outeriter        += nri;
1004
1005     /* Update outer/inner flops */
1006
1007     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*255);
1008 }
1009 /*
1010  * Gromacs nonbonded kernel:   nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_single
1011  * Electrostatics interaction: Coulomb
1012  * VdW interaction:            LennardJones
1013  * Geometry:                   Water3-Water3
1014  * Calculate force/pot:        Force
1015  */
1016 void
1017 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_single
1018                     (t_nblist                    * gmx_restrict       nlist,
1019                      rvec                        * gmx_restrict          xx,
1020                      rvec                        * gmx_restrict          ff,
1021                      t_forcerec                  * gmx_restrict          fr,
1022                      t_mdatoms                   * gmx_restrict     mdatoms,
1023                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1024                      t_nrnb                      * gmx_restrict        nrnb)
1025 {
1026     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
1027      * just 0 for non-waters.
1028      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, i.e. for the eight different
1029      * jnr indices corresponding to data put in the eight positions in the SIMD register.
1030      */
1031     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
1032     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1033     int              jnrA,jnrB,jnrC,jnrD;
1034     int              jnrE,jnrF,jnrG,jnrH;
1035     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1036     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1037     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1038     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1039     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
1040     real             rcutoff_scalar;
1041     real             *shiftvec,*fshift,*x,*f;
1042     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1043     real             scratch[4*DIM];
1044     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1045     real *           vdwioffsetptr0;
1046     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1047     real *           vdwioffsetptr1;
1048     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1049     real *           vdwioffsetptr2;
1050     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1051     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1052     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1053     int              vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1054     __m256           jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1055     int              vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1056     __m256           jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1057     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1058     __m256           dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1059     __m256           dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1060     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1061     __m256           dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1062     __m256           dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1063     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1064     __m256           dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1065     __m256           dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1066     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
1067     real             *charge;
1068     int              nvdwtype;
1069     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1070     int              *vdwtype;
1071     real             *vdwparam;
1072     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
1073     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
1074     __m256           dummy_mask,cutoff_mask;
1075     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1076     __m256           one     = _mm256_set1_ps(1.0);
1077     __m256           two     = _mm256_set1_ps(2.0);
1078     x                = xx[0];
1079     f                = ff[0];
1080
1081     nri              = nlist->nri;
1082     iinr             = nlist->iinr;
1083     jindex           = nlist->jindex;
1084     jjnr             = nlist->jjnr;
1085     shiftidx         = nlist->shift;
1086     gid              = nlist->gid;
1087     shiftvec         = fr->shift_vec[0];
1088     fshift           = fr->fshift[0];
1089     facel            = _mm256_set1_ps(fr->epsfac);
1090     charge           = mdatoms->chargeA;
1091     nvdwtype         = fr->ntype;
1092     vdwparam         = fr->nbfp;
1093     vdwtype          = mdatoms->typeA;
1094
1095     /* Setup water-specific parameters */
1096     inr              = nlist->iinr[0];
1097     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1098     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1099     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1100     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
1101
1102     jq0              = _mm256_set1_ps(charge[inr+0]);
1103     jq1              = _mm256_set1_ps(charge[inr+1]);
1104     jq2              = _mm256_set1_ps(charge[inr+2]);
1105     vdwjidx0A        = 2*vdwtype[inr+0];
1106     qq00             = _mm256_mul_ps(iq0,jq0);
1107     c6_00            = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1108     c12_00           = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1109     qq01             = _mm256_mul_ps(iq0,jq1);
1110     qq02             = _mm256_mul_ps(iq0,jq2);
1111     qq10             = _mm256_mul_ps(iq1,jq0);
1112     qq11             = _mm256_mul_ps(iq1,jq1);
1113     qq12             = _mm256_mul_ps(iq1,jq2);
1114     qq20             = _mm256_mul_ps(iq2,jq0);
1115     qq21             = _mm256_mul_ps(iq2,jq1);
1116     qq22             = _mm256_mul_ps(iq2,jq2);
1117
1118     /* Avoid stupid compiler warnings */
1119     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1120     j_coord_offsetA = 0;
1121     j_coord_offsetB = 0;
1122     j_coord_offsetC = 0;
1123     j_coord_offsetD = 0;
1124     j_coord_offsetE = 0;
1125     j_coord_offsetF = 0;
1126     j_coord_offsetG = 0;
1127     j_coord_offsetH = 0;
1128
1129     outeriter        = 0;
1130     inneriter        = 0;
1131
1132     for(iidx=0;iidx<4*DIM;iidx++)
1133     {
1134         scratch[iidx] = 0.0;
1135     }
1136
1137     /* Start outer loop over neighborlists */
1138     for(iidx=0; iidx<nri; iidx++)
1139     {
1140         /* Load shift vector for this list */
1141         i_shift_offset   = DIM*shiftidx[iidx];
1142
1143         /* Load limits for loop over neighbors */
1144         j_index_start    = jindex[iidx];
1145         j_index_end      = jindex[iidx+1];
1146
1147         /* Get outer coordinate index */
1148         inr              = iinr[iidx];
1149         i_coord_offset   = DIM*inr;
1150
1151         /* Load i particle coords and add shift vector */
1152         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1153                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1154
1155         fix0             = _mm256_setzero_ps();
1156         fiy0             = _mm256_setzero_ps();
1157         fiz0             = _mm256_setzero_ps();
1158         fix1             = _mm256_setzero_ps();
1159         fiy1             = _mm256_setzero_ps();
1160         fiz1             = _mm256_setzero_ps();
1161         fix2             = _mm256_setzero_ps();
1162         fiy2             = _mm256_setzero_ps();
1163         fiz2             = _mm256_setzero_ps();
1164
1165         /* Start inner kernel loop */
1166         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1167         {
1168
1169             /* Get j neighbor index, and coordinate index */
1170             jnrA             = jjnr[jidx];
1171             jnrB             = jjnr[jidx+1];
1172             jnrC             = jjnr[jidx+2];
1173             jnrD             = jjnr[jidx+3];
1174             jnrE             = jjnr[jidx+4];
1175             jnrF             = jjnr[jidx+5];
1176             jnrG             = jjnr[jidx+6];
1177             jnrH             = jjnr[jidx+7];
1178             j_coord_offsetA  = DIM*jnrA;
1179             j_coord_offsetB  = DIM*jnrB;
1180             j_coord_offsetC  = DIM*jnrC;
1181             j_coord_offsetD  = DIM*jnrD;
1182             j_coord_offsetE  = DIM*jnrE;
1183             j_coord_offsetF  = DIM*jnrF;
1184             j_coord_offsetG  = DIM*jnrG;
1185             j_coord_offsetH  = DIM*jnrH;
1186
1187             /* load j atom coordinates */
1188             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1189                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1190                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1191                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1192                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1193
1194             /* Calculate displacement vector */
1195             dx00             = _mm256_sub_ps(ix0,jx0);
1196             dy00             = _mm256_sub_ps(iy0,jy0);
1197             dz00             = _mm256_sub_ps(iz0,jz0);
1198             dx01             = _mm256_sub_ps(ix0,jx1);
1199             dy01             = _mm256_sub_ps(iy0,jy1);
1200             dz01             = _mm256_sub_ps(iz0,jz1);
1201             dx02             = _mm256_sub_ps(ix0,jx2);
1202             dy02             = _mm256_sub_ps(iy0,jy2);
1203             dz02             = _mm256_sub_ps(iz0,jz2);
1204             dx10             = _mm256_sub_ps(ix1,jx0);
1205             dy10             = _mm256_sub_ps(iy1,jy0);
1206             dz10             = _mm256_sub_ps(iz1,jz0);
1207             dx11             = _mm256_sub_ps(ix1,jx1);
1208             dy11             = _mm256_sub_ps(iy1,jy1);
1209             dz11             = _mm256_sub_ps(iz1,jz1);
1210             dx12             = _mm256_sub_ps(ix1,jx2);
1211             dy12             = _mm256_sub_ps(iy1,jy2);
1212             dz12             = _mm256_sub_ps(iz1,jz2);
1213             dx20             = _mm256_sub_ps(ix2,jx0);
1214             dy20             = _mm256_sub_ps(iy2,jy0);
1215             dz20             = _mm256_sub_ps(iz2,jz0);
1216             dx21             = _mm256_sub_ps(ix2,jx1);
1217             dy21             = _mm256_sub_ps(iy2,jy1);
1218             dz21             = _mm256_sub_ps(iz2,jz1);
1219             dx22             = _mm256_sub_ps(ix2,jx2);
1220             dy22             = _mm256_sub_ps(iy2,jy2);
1221             dz22             = _mm256_sub_ps(iz2,jz2);
1222
1223             /* Calculate squared distance and things based on it */
1224             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1225             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1226             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1227             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1228             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1229             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1230             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1231             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1232             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1233
1234             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1235             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1236             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1237             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1238             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1239             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1240             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1241             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1242             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1243
1244             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1245             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1246             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1247             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1248             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1249             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1250             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1251             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1252             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1253
1254             fjx0             = _mm256_setzero_ps();
1255             fjy0             = _mm256_setzero_ps();
1256             fjz0             = _mm256_setzero_ps();
1257             fjx1             = _mm256_setzero_ps();
1258             fjy1             = _mm256_setzero_ps();
1259             fjz1             = _mm256_setzero_ps();
1260             fjx2             = _mm256_setzero_ps();
1261             fjy2             = _mm256_setzero_ps();
1262             fjz2             = _mm256_setzero_ps();
1263
1264             /**************************
1265              * CALCULATE INTERACTIONS *
1266              **************************/
1267
1268             /* COULOMB ELECTROSTATICS */
1269             velec            = _mm256_mul_ps(qq00,rinv00);
1270             felec            = _mm256_mul_ps(velec,rinvsq00);
1271
1272             /* LENNARD-JONES DISPERSION/REPULSION */
1273
1274             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1275             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1276
1277             fscal            = _mm256_add_ps(felec,fvdw);
1278
1279             /* Calculate temporary vectorial force */
1280             tx               = _mm256_mul_ps(fscal,dx00);
1281             ty               = _mm256_mul_ps(fscal,dy00);
1282             tz               = _mm256_mul_ps(fscal,dz00);
1283
1284             /* Update vectorial force */
1285             fix0             = _mm256_add_ps(fix0,tx);
1286             fiy0             = _mm256_add_ps(fiy0,ty);
1287             fiz0             = _mm256_add_ps(fiz0,tz);
1288
1289             fjx0             = _mm256_add_ps(fjx0,tx);
1290             fjy0             = _mm256_add_ps(fjy0,ty);
1291             fjz0             = _mm256_add_ps(fjz0,tz);
1292
1293             /**************************
1294              * CALCULATE INTERACTIONS *
1295              **************************/
1296
1297             /* COULOMB ELECTROSTATICS */
1298             velec            = _mm256_mul_ps(qq01,rinv01);
1299             felec            = _mm256_mul_ps(velec,rinvsq01);
1300
1301             fscal            = felec;
1302
1303             /* Calculate temporary vectorial force */
1304             tx               = _mm256_mul_ps(fscal,dx01);
1305             ty               = _mm256_mul_ps(fscal,dy01);
1306             tz               = _mm256_mul_ps(fscal,dz01);
1307
1308             /* Update vectorial force */
1309             fix0             = _mm256_add_ps(fix0,tx);
1310             fiy0             = _mm256_add_ps(fiy0,ty);
1311             fiz0             = _mm256_add_ps(fiz0,tz);
1312
1313             fjx1             = _mm256_add_ps(fjx1,tx);
1314             fjy1             = _mm256_add_ps(fjy1,ty);
1315             fjz1             = _mm256_add_ps(fjz1,tz);
1316
1317             /**************************
1318              * CALCULATE INTERACTIONS *
1319              **************************/
1320
1321             /* COULOMB ELECTROSTATICS */
1322             velec            = _mm256_mul_ps(qq02,rinv02);
1323             felec            = _mm256_mul_ps(velec,rinvsq02);
1324
1325             fscal            = felec;
1326
1327             /* Calculate temporary vectorial force */
1328             tx               = _mm256_mul_ps(fscal,dx02);
1329             ty               = _mm256_mul_ps(fscal,dy02);
1330             tz               = _mm256_mul_ps(fscal,dz02);
1331
1332             /* Update vectorial force */
1333             fix0             = _mm256_add_ps(fix0,tx);
1334             fiy0             = _mm256_add_ps(fiy0,ty);
1335             fiz0             = _mm256_add_ps(fiz0,tz);
1336
1337             fjx2             = _mm256_add_ps(fjx2,tx);
1338             fjy2             = _mm256_add_ps(fjy2,ty);
1339             fjz2             = _mm256_add_ps(fjz2,tz);
1340
1341             /**************************
1342              * CALCULATE INTERACTIONS *
1343              **************************/
1344
1345             /* COULOMB ELECTROSTATICS */
1346             velec            = _mm256_mul_ps(qq10,rinv10);
1347             felec            = _mm256_mul_ps(velec,rinvsq10);
1348
1349             fscal            = felec;
1350
1351             /* Calculate temporary vectorial force */
1352             tx               = _mm256_mul_ps(fscal,dx10);
1353             ty               = _mm256_mul_ps(fscal,dy10);
1354             tz               = _mm256_mul_ps(fscal,dz10);
1355
1356             /* Update vectorial force */
1357             fix1             = _mm256_add_ps(fix1,tx);
1358             fiy1             = _mm256_add_ps(fiy1,ty);
1359             fiz1             = _mm256_add_ps(fiz1,tz);
1360
1361             fjx0             = _mm256_add_ps(fjx0,tx);
1362             fjy0             = _mm256_add_ps(fjy0,ty);
1363             fjz0             = _mm256_add_ps(fjz0,tz);
1364
1365             /**************************
1366              * CALCULATE INTERACTIONS *
1367              **************************/
1368
1369             /* COULOMB ELECTROSTATICS */
1370             velec            = _mm256_mul_ps(qq11,rinv11);
1371             felec            = _mm256_mul_ps(velec,rinvsq11);
1372
1373             fscal            = felec;
1374
1375             /* Calculate temporary vectorial force */
1376             tx               = _mm256_mul_ps(fscal,dx11);
1377             ty               = _mm256_mul_ps(fscal,dy11);
1378             tz               = _mm256_mul_ps(fscal,dz11);
1379
1380             /* Update vectorial force */
1381             fix1             = _mm256_add_ps(fix1,tx);
1382             fiy1             = _mm256_add_ps(fiy1,ty);
1383             fiz1             = _mm256_add_ps(fiz1,tz);
1384
1385             fjx1             = _mm256_add_ps(fjx1,tx);
1386             fjy1             = _mm256_add_ps(fjy1,ty);
1387             fjz1             = _mm256_add_ps(fjz1,tz);
1388
1389             /**************************
1390              * CALCULATE INTERACTIONS *
1391              **************************/
1392
1393             /* COULOMB ELECTROSTATICS */
1394             velec            = _mm256_mul_ps(qq12,rinv12);
1395             felec            = _mm256_mul_ps(velec,rinvsq12);
1396
1397             fscal            = felec;
1398
1399             /* Calculate temporary vectorial force */
1400             tx               = _mm256_mul_ps(fscal,dx12);
1401             ty               = _mm256_mul_ps(fscal,dy12);
1402             tz               = _mm256_mul_ps(fscal,dz12);
1403
1404             /* Update vectorial force */
1405             fix1             = _mm256_add_ps(fix1,tx);
1406             fiy1             = _mm256_add_ps(fiy1,ty);
1407             fiz1             = _mm256_add_ps(fiz1,tz);
1408
1409             fjx2             = _mm256_add_ps(fjx2,tx);
1410             fjy2             = _mm256_add_ps(fjy2,ty);
1411             fjz2             = _mm256_add_ps(fjz2,tz);
1412
1413             /**************************
1414              * CALCULATE INTERACTIONS *
1415              **************************/
1416
1417             /* COULOMB ELECTROSTATICS */
1418             velec            = _mm256_mul_ps(qq20,rinv20);
1419             felec            = _mm256_mul_ps(velec,rinvsq20);
1420
1421             fscal            = felec;
1422
1423             /* Calculate temporary vectorial force */
1424             tx               = _mm256_mul_ps(fscal,dx20);
1425             ty               = _mm256_mul_ps(fscal,dy20);
1426             tz               = _mm256_mul_ps(fscal,dz20);
1427
1428             /* Update vectorial force */
1429             fix2             = _mm256_add_ps(fix2,tx);
1430             fiy2             = _mm256_add_ps(fiy2,ty);
1431             fiz2             = _mm256_add_ps(fiz2,tz);
1432
1433             fjx0             = _mm256_add_ps(fjx0,tx);
1434             fjy0             = _mm256_add_ps(fjy0,ty);
1435             fjz0             = _mm256_add_ps(fjz0,tz);
1436
1437             /**************************
1438              * CALCULATE INTERACTIONS *
1439              **************************/
1440
1441             /* COULOMB ELECTROSTATICS */
1442             velec            = _mm256_mul_ps(qq21,rinv21);
1443             felec            = _mm256_mul_ps(velec,rinvsq21);
1444
1445             fscal            = felec;
1446
1447             /* Calculate temporary vectorial force */
1448             tx               = _mm256_mul_ps(fscal,dx21);
1449             ty               = _mm256_mul_ps(fscal,dy21);
1450             tz               = _mm256_mul_ps(fscal,dz21);
1451
1452             /* Update vectorial force */
1453             fix2             = _mm256_add_ps(fix2,tx);
1454             fiy2             = _mm256_add_ps(fiy2,ty);
1455             fiz2             = _mm256_add_ps(fiz2,tz);
1456
1457             fjx1             = _mm256_add_ps(fjx1,tx);
1458             fjy1             = _mm256_add_ps(fjy1,ty);
1459             fjz1             = _mm256_add_ps(fjz1,tz);
1460
1461             /**************************
1462              * CALCULATE INTERACTIONS *
1463              **************************/
1464
1465             /* COULOMB ELECTROSTATICS */
1466             velec            = _mm256_mul_ps(qq22,rinv22);
1467             felec            = _mm256_mul_ps(velec,rinvsq22);
1468
1469             fscal            = felec;
1470
1471             /* Calculate temporary vectorial force */
1472             tx               = _mm256_mul_ps(fscal,dx22);
1473             ty               = _mm256_mul_ps(fscal,dy22);
1474             tz               = _mm256_mul_ps(fscal,dz22);
1475
1476             /* Update vectorial force */
1477             fix2             = _mm256_add_ps(fix2,tx);
1478             fiy2             = _mm256_add_ps(fiy2,ty);
1479             fiz2             = _mm256_add_ps(fiz2,tz);
1480
1481             fjx2             = _mm256_add_ps(fjx2,tx);
1482             fjy2             = _mm256_add_ps(fjy2,ty);
1483             fjz2             = _mm256_add_ps(fjz2,tz);
1484
1485             fjptrA             = f+j_coord_offsetA;
1486             fjptrB             = f+j_coord_offsetB;
1487             fjptrC             = f+j_coord_offsetC;
1488             fjptrD             = f+j_coord_offsetD;
1489             fjptrE             = f+j_coord_offsetE;
1490             fjptrF             = f+j_coord_offsetF;
1491             fjptrG             = f+j_coord_offsetG;
1492             fjptrH             = f+j_coord_offsetH;
1493
1494             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1495                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1496
1497             /* Inner loop uses 241 flops */
1498         }
1499
1500         if(jidx<j_index_end)
1501         {
1502
1503             /* Get j neighbor index, and coordinate index */
1504             jnrlistA         = jjnr[jidx];
1505             jnrlistB         = jjnr[jidx+1];
1506             jnrlistC         = jjnr[jidx+2];
1507             jnrlistD         = jjnr[jidx+3];
1508             jnrlistE         = jjnr[jidx+4];
1509             jnrlistF         = jjnr[jidx+5];
1510             jnrlistG         = jjnr[jidx+6];
1511             jnrlistH         = jjnr[jidx+7];
1512             /* Sign of each element will be negative for non-real atoms.
1513              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1514              * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1515              */
1516             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1517                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1518                                             
1519             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1520             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1521             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1522             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1523             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1524             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1525             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1526             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1527             j_coord_offsetA  = DIM*jnrA;
1528             j_coord_offsetB  = DIM*jnrB;
1529             j_coord_offsetC  = DIM*jnrC;
1530             j_coord_offsetD  = DIM*jnrD;
1531             j_coord_offsetE  = DIM*jnrE;
1532             j_coord_offsetF  = DIM*jnrF;
1533             j_coord_offsetG  = DIM*jnrG;
1534             j_coord_offsetH  = DIM*jnrH;
1535
1536             /* load j atom coordinates */
1537             gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1538                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1539                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1540                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1541                                               &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1542
1543             /* Calculate displacement vector */
1544             dx00             = _mm256_sub_ps(ix0,jx0);
1545             dy00             = _mm256_sub_ps(iy0,jy0);
1546             dz00             = _mm256_sub_ps(iz0,jz0);
1547             dx01             = _mm256_sub_ps(ix0,jx1);
1548             dy01             = _mm256_sub_ps(iy0,jy1);
1549             dz01             = _mm256_sub_ps(iz0,jz1);
1550             dx02             = _mm256_sub_ps(ix0,jx2);
1551             dy02             = _mm256_sub_ps(iy0,jy2);
1552             dz02             = _mm256_sub_ps(iz0,jz2);
1553             dx10             = _mm256_sub_ps(ix1,jx0);
1554             dy10             = _mm256_sub_ps(iy1,jy0);
1555             dz10             = _mm256_sub_ps(iz1,jz0);
1556             dx11             = _mm256_sub_ps(ix1,jx1);
1557             dy11             = _mm256_sub_ps(iy1,jy1);
1558             dz11             = _mm256_sub_ps(iz1,jz1);
1559             dx12             = _mm256_sub_ps(ix1,jx2);
1560             dy12             = _mm256_sub_ps(iy1,jy2);
1561             dz12             = _mm256_sub_ps(iz1,jz2);
1562             dx20             = _mm256_sub_ps(ix2,jx0);
1563             dy20             = _mm256_sub_ps(iy2,jy0);
1564             dz20             = _mm256_sub_ps(iz2,jz0);
1565             dx21             = _mm256_sub_ps(ix2,jx1);
1566             dy21             = _mm256_sub_ps(iy2,jy1);
1567             dz21             = _mm256_sub_ps(iz2,jz1);
1568             dx22             = _mm256_sub_ps(ix2,jx2);
1569             dy22             = _mm256_sub_ps(iy2,jy2);
1570             dz22             = _mm256_sub_ps(iz2,jz2);
1571
1572             /* Calculate squared distance and things based on it */
1573             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1574             rsq01            = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1575             rsq02            = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1576             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1577             rsq11            = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1578             rsq12            = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1579             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1580             rsq21            = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1581             rsq22            = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1582
1583             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1584             rinv01           = gmx_mm256_invsqrt_ps(rsq01);
1585             rinv02           = gmx_mm256_invsqrt_ps(rsq02);
1586             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1587             rinv11           = gmx_mm256_invsqrt_ps(rsq11);
1588             rinv12           = gmx_mm256_invsqrt_ps(rsq12);
1589             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1590             rinv21           = gmx_mm256_invsqrt_ps(rsq21);
1591             rinv22           = gmx_mm256_invsqrt_ps(rsq22);
1592
1593             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1594             rinvsq01         = _mm256_mul_ps(rinv01,rinv01);
1595             rinvsq02         = _mm256_mul_ps(rinv02,rinv02);
1596             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1597             rinvsq11         = _mm256_mul_ps(rinv11,rinv11);
1598             rinvsq12         = _mm256_mul_ps(rinv12,rinv12);
1599             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1600             rinvsq21         = _mm256_mul_ps(rinv21,rinv21);
1601             rinvsq22         = _mm256_mul_ps(rinv22,rinv22);
1602
1603             fjx0             = _mm256_setzero_ps();
1604             fjy0             = _mm256_setzero_ps();
1605             fjz0             = _mm256_setzero_ps();
1606             fjx1             = _mm256_setzero_ps();
1607             fjy1             = _mm256_setzero_ps();
1608             fjz1             = _mm256_setzero_ps();
1609             fjx2             = _mm256_setzero_ps();
1610             fjy2             = _mm256_setzero_ps();
1611             fjz2             = _mm256_setzero_ps();
1612
1613             /**************************
1614              * CALCULATE INTERACTIONS *
1615              **************************/
1616
1617             /* COULOMB ELECTROSTATICS */
1618             velec            = _mm256_mul_ps(qq00,rinv00);
1619             felec            = _mm256_mul_ps(velec,rinvsq00);
1620
1621             /* LENNARD-JONES DISPERSION/REPULSION */
1622
1623             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1624             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1625
1626             fscal            = _mm256_add_ps(felec,fvdw);
1627
1628             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1629
1630             /* Calculate temporary vectorial force */
1631             tx               = _mm256_mul_ps(fscal,dx00);
1632             ty               = _mm256_mul_ps(fscal,dy00);
1633             tz               = _mm256_mul_ps(fscal,dz00);
1634
1635             /* Update vectorial force */
1636             fix0             = _mm256_add_ps(fix0,tx);
1637             fiy0             = _mm256_add_ps(fiy0,ty);
1638             fiz0             = _mm256_add_ps(fiz0,tz);
1639
1640             fjx0             = _mm256_add_ps(fjx0,tx);
1641             fjy0             = _mm256_add_ps(fjy0,ty);
1642             fjz0             = _mm256_add_ps(fjz0,tz);
1643
1644             /**************************
1645              * CALCULATE INTERACTIONS *
1646              **************************/
1647
1648             /* COULOMB ELECTROSTATICS */
1649             velec            = _mm256_mul_ps(qq01,rinv01);
1650             felec            = _mm256_mul_ps(velec,rinvsq01);
1651
1652             fscal            = felec;
1653
1654             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1655
1656             /* Calculate temporary vectorial force */
1657             tx               = _mm256_mul_ps(fscal,dx01);
1658             ty               = _mm256_mul_ps(fscal,dy01);
1659             tz               = _mm256_mul_ps(fscal,dz01);
1660
1661             /* Update vectorial force */
1662             fix0             = _mm256_add_ps(fix0,tx);
1663             fiy0             = _mm256_add_ps(fiy0,ty);
1664             fiz0             = _mm256_add_ps(fiz0,tz);
1665
1666             fjx1             = _mm256_add_ps(fjx1,tx);
1667             fjy1             = _mm256_add_ps(fjy1,ty);
1668             fjz1             = _mm256_add_ps(fjz1,tz);
1669
1670             /**************************
1671              * CALCULATE INTERACTIONS *
1672              **************************/
1673
1674             /* COULOMB ELECTROSTATICS */
1675             velec            = _mm256_mul_ps(qq02,rinv02);
1676             felec            = _mm256_mul_ps(velec,rinvsq02);
1677
1678             fscal            = felec;
1679
1680             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1681
1682             /* Calculate temporary vectorial force */
1683             tx               = _mm256_mul_ps(fscal,dx02);
1684             ty               = _mm256_mul_ps(fscal,dy02);
1685             tz               = _mm256_mul_ps(fscal,dz02);
1686
1687             /* Update vectorial force */
1688             fix0             = _mm256_add_ps(fix0,tx);
1689             fiy0             = _mm256_add_ps(fiy0,ty);
1690             fiz0             = _mm256_add_ps(fiz0,tz);
1691
1692             fjx2             = _mm256_add_ps(fjx2,tx);
1693             fjy2             = _mm256_add_ps(fjy2,ty);
1694             fjz2             = _mm256_add_ps(fjz2,tz);
1695
1696             /**************************
1697              * CALCULATE INTERACTIONS *
1698              **************************/
1699
1700             /* COULOMB ELECTROSTATICS */
1701             velec            = _mm256_mul_ps(qq10,rinv10);
1702             felec            = _mm256_mul_ps(velec,rinvsq10);
1703
1704             fscal            = felec;
1705
1706             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1707
1708             /* Calculate temporary vectorial force */
1709             tx               = _mm256_mul_ps(fscal,dx10);
1710             ty               = _mm256_mul_ps(fscal,dy10);
1711             tz               = _mm256_mul_ps(fscal,dz10);
1712
1713             /* Update vectorial force */
1714             fix1             = _mm256_add_ps(fix1,tx);
1715             fiy1             = _mm256_add_ps(fiy1,ty);
1716             fiz1             = _mm256_add_ps(fiz1,tz);
1717
1718             fjx0             = _mm256_add_ps(fjx0,tx);
1719             fjy0             = _mm256_add_ps(fjy0,ty);
1720             fjz0             = _mm256_add_ps(fjz0,tz);
1721
1722             /**************************
1723              * CALCULATE INTERACTIONS *
1724              **************************/
1725
1726             /* COULOMB ELECTROSTATICS */
1727             velec            = _mm256_mul_ps(qq11,rinv11);
1728             felec            = _mm256_mul_ps(velec,rinvsq11);
1729
1730             fscal            = felec;
1731
1732             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1733
1734             /* Calculate temporary vectorial force */
1735             tx               = _mm256_mul_ps(fscal,dx11);
1736             ty               = _mm256_mul_ps(fscal,dy11);
1737             tz               = _mm256_mul_ps(fscal,dz11);
1738
1739             /* Update vectorial force */
1740             fix1             = _mm256_add_ps(fix1,tx);
1741             fiy1             = _mm256_add_ps(fiy1,ty);
1742             fiz1             = _mm256_add_ps(fiz1,tz);
1743
1744             fjx1             = _mm256_add_ps(fjx1,tx);
1745             fjy1             = _mm256_add_ps(fjy1,ty);
1746             fjz1             = _mm256_add_ps(fjz1,tz);
1747
1748             /**************************
1749              * CALCULATE INTERACTIONS *
1750              **************************/
1751
1752             /* COULOMB ELECTROSTATICS */
1753             velec            = _mm256_mul_ps(qq12,rinv12);
1754             felec            = _mm256_mul_ps(velec,rinvsq12);
1755
1756             fscal            = felec;
1757
1758             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1759
1760             /* Calculate temporary vectorial force */
1761             tx               = _mm256_mul_ps(fscal,dx12);
1762             ty               = _mm256_mul_ps(fscal,dy12);
1763             tz               = _mm256_mul_ps(fscal,dz12);
1764
1765             /* Update vectorial force */
1766             fix1             = _mm256_add_ps(fix1,tx);
1767             fiy1             = _mm256_add_ps(fiy1,ty);
1768             fiz1             = _mm256_add_ps(fiz1,tz);
1769
1770             fjx2             = _mm256_add_ps(fjx2,tx);
1771             fjy2             = _mm256_add_ps(fjy2,ty);
1772             fjz2             = _mm256_add_ps(fjz2,tz);
1773
1774             /**************************
1775              * CALCULATE INTERACTIONS *
1776              **************************/
1777
1778             /* COULOMB ELECTROSTATICS */
1779             velec            = _mm256_mul_ps(qq20,rinv20);
1780             felec            = _mm256_mul_ps(velec,rinvsq20);
1781
1782             fscal            = felec;
1783
1784             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1785
1786             /* Calculate temporary vectorial force */
1787             tx               = _mm256_mul_ps(fscal,dx20);
1788             ty               = _mm256_mul_ps(fscal,dy20);
1789             tz               = _mm256_mul_ps(fscal,dz20);
1790
1791             /* Update vectorial force */
1792             fix2             = _mm256_add_ps(fix2,tx);
1793             fiy2             = _mm256_add_ps(fiy2,ty);
1794             fiz2             = _mm256_add_ps(fiz2,tz);
1795
1796             fjx0             = _mm256_add_ps(fjx0,tx);
1797             fjy0             = _mm256_add_ps(fjy0,ty);
1798             fjz0             = _mm256_add_ps(fjz0,tz);
1799
1800             /**************************
1801              * CALCULATE INTERACTIONS *
1802              **************************/
1803
1804             /* COULOMB ELECTROSTATICS */
1805             velec            = _mm256_mul_ps(qq21,rinv21);
1806             felec            = _mm256_mul_ps(velec,rinvsq21);
1807
1808             fscal            = felec;
1809
1810             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1811
1812             /* Calculate temporary vectorial force */
1813             tx               = _mm256_mul_ps(fscal,dx21);
1814             ty               = _mm256_mul_ps(fscal,dy21);
1815             tz               = _mm256_mul_ps(fscal,dz21);
1816
1817             /* Update vectorial force */
1818             fix2             = _mm256_add_ps(fix2,tx);
1819             fiy2             = _mm256_add_ps(fiy2,ty);
1820             fiz2             = _mm256_add_ps(fiz2,tz);
1821
1822             fjx1             = _mm256_add_ps(fjx1,tx);
1823             fjy1             = _mm256_add_ps(fjy1,ty);
1824             fjz1             = _mm256_add_ps(fjz1,tz);
1825
1826             /**************************
1827              * CALCULATE INTERACTIONS *
1828              **************************/
1829
1830             /* COULOMB ELECTROSTATICS */
1831             velec            = _mm256_mul_ps(qq22,rinv22);
1832             felec            = _mm256_mul_ps(velec,rinvsq22);
1833
1834             fscal            = felec;
1835
1836             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1837
1838             /* Calculate temporary vectorial force */
1839             tx               = _mm256_mul_ps(fscal,dx22);
1840             ty               = _mm256_mul_ps(fscal,dy22);
1841             tz               = _mm256_mul_ps(fscal,dz22);
1842
1843             /* Update vectorial force */
1844             fix2             = _mm256_add_ps(fix2,tx);
1845             fiy2             = _mm256_add_ps(fiy2,ty);
1846             fiz2             = _mm256_add_ps(fiz2,tz);
1847
1848             fjx2             = _mm256_add_ps(fjx2,tx);
1849             fjy2             = _mm256_add_ps(fjy2,ty);
1850             fjz2             = _mm256_add_ps(fjz2,tz);
1851
1852             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1853             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1854             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1855             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1856             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1857             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1858             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1859             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1860
1861             gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1862                                                       fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1863
1864             /* Inner loop uses 241 flops */
1865         }
1866
1867         /* End of innermost loop */
1868
1869         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1870                                                  f+i_coord_offset,fshift+i_shift_offset);
1871
1872         /* Increment number of inner iterations */
1873         inneriter                  += j_index_end - j_index_start;
1874
1875         /* Outer loop uses 18 flops */
1876     }
1877
1878     /* Increment number of outer iterations */
1879     outeriter        += nri;
1880
1881     /* Update outer/inner flops */
1882
1883     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*241);
1884 }