Remove no-inline-max-size and suppress remark
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecEw_VdwLJ_GeomW3P1_avx_256_single.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #ifdef HAVE_CONFIG_H
39 #include <config.h>
40 #endif
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
46 #include "vec.h"
47 #include "nrnb.h"
48
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
51
52 /*
53  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_single
54  * Electrostatics interaction: Ewald
55  * VdW interaction:            LennardJones
56  * Geometry:                   Water3-Particle
57  * Calculate force/pot:        PotentialAndForce
58  */
59 void
60 nb_kernel_ElecEw_VdwLJ_GeomW3P1_VF_avx_256_single
61                     (t_nblist                    * gmx_restrict       nlist,
62                      rvec                        * gmx_restrict          xx,
63                      rvec                        * gmx_restrict          ff,
64                      t_forcerec                  * gmx_restrict          fr,
65                      t_mdatoms                   * gmx_restrict     mdatoms,
66                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67                      t_nrnb                      * gmx_restrict        nrnb)
68 {
69     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
70      * just 0 for non-waters.
71      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72      * jnr indices corresponding to data put in the four positions in the SIMD register.
73      */
74     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
75     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76     int              jnrA,jnrB,jnrC,jnrD;
77     int              jnrE,jnrF,jnrG,jnrH;
78     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
83     real             rcutoff_scalar;
84     real             *shiftvec,*fshift,*x,*f;
85     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
86     real             scratch[4*DIM];
87     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88     real *           vdwioffsetptr0;
89     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90     real *           vdwioffsetptr1;
91     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92     real *           vdwioffsetptr2;
93     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
97     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
98     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
99     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
100     real             *charge;
101     int              nvdwtype;
102     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
103     int              *vdwtype;
104     real             *vdwparam;
105     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
106     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
107     __m256i          ewitab;
108     __m128i          ewitab_lo,ewitab_hi;
109     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
110     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
111     real             *ewtab;
112     __m256           dummy_mask,cutoff_mask;
113     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
114     __m256           one     = _mm256_set1_ps(1.0);
115     __m256           two     = _mm256_set1_ps(2.0);
116     x                = xx[0];
117     f                = ff[0];
118
119     nri              = nlist->nri;
120     iinr             = nlist->iinr;
121     jindex           = nlist->jindex;
122     jjnr             = nlist->jjnr;
123     shiftidx         = nlist->shift;
124     gid              = nlist->gid;
125     shiftvec         = fr->shift_vec[0];
126     fshift           = fr->fshift[0];
127     facel            = _mm256_set1_ps(fr->epsfac);
128     charge           = mdatoms->chargeA;
129     nvdwtype         = fr->ntype;
130     vdwparam         = fr->nbfp;
131     vdwtype          = mdatoms->typeA;
132
133     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
134     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
135     beta2            = _mm256_mul_ps(beta,beta);
136     beta3            = _mm256_mul_ps(beta,beta2);
137
138     ewtab            = fr->ic->tabq_coul_FDV0;
139     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
140     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
141
142     /* Setup water-specific parameters */
143     inr              = nlist->iinr[0];
144     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
145     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
146     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
147     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
148
149     /* Avoid stupid compiler warnings */
150     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
151     j_coord_offsetA = 0;
152     j_coord_offsetB = 0;
153     j_coord_offsetC = 0;
154     j_coord_offsetD = 0;
155     j_coord_offsetE = 0;
156     j_coord_offsetF = 0;
157     j_coord_offsetG = 0;
158     j_coord_offsetH = 0;
159
160     outeriter        = 0;
161     inneriter        = 0;
162
163     for(iidx=0;iidx<4*DIM;iidx++)
164     {
165         scratch[iidx] = 0.0;
166     }
167
168     /* Start outer loop over neighborlists */
169     for(iidx=0; iidx<nri; iidx++)
170     {
171         /* Load shift vector for this list */
172         i_shift_offset   = DIM*shiftidx[iidx];
173
174         /* Load limits for loop over neighbors */
175         j_index_start    = jindex[iidx];
176         j_index_end      = jindex[iidx+1];
177
178         /* Get outer coordinate index */
179         inr              = iinr[iidx];
180         i_coord_offset   = DIM*inr;
181
182         /* Load i particle coords and add shift vector */
183         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
184                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
185
186         fix0             = _mm256_setzero_ps();
187         fiy0             = _mm256_setzero_ps();
188         fiz0             = _mm256_setzero_ps();
189         fix1             = _mm256_setzero_ps();
190         fiy1             = _mm256_setzero_ps();
191         fiz1             = _mm256_setzero_ps();
192         fix2             = _mm256_setzero_ps();
193         fiy2             = _mm256_setzero_ps();
194         fiz2             = _mm256_setzero_ps();
195
196         /* Reset potential sums */
197         velecsum         = _mm256_setzero_ps();
198         vvdwsum          = _mm256_setzero_ps();
199
200         /* Start inner kernel loop */
201         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
202         {
203
204             /* Get j neighbor index, and coordinate index */
205             jnrA             = jjnr[jidx];
206             jnrB             = jjnr[jidx+1];
207             jnrC             = jjnr[jidx+2];
208             jnrD             = jjnr[jidx+3];
209             jnrE             = jjnr[jidx+4];
210             jnrF             = jjnr[jidx+5];
211             jnrG             = jjnr[jidx+6];
212             jnrH             = jjnr[jidx+7];
213             j_coord_offsetA  = DIM*jnrA;
214             j_coord_offsetB  = DIM*jnrB;
215             j_coord_offsetC  = DIM*jnrC;
216             j_coord_offsetD  = DIM*jnrD;
217             j_coord_offsetE  = DIM*jnrE;
218             j_coord_offsetF  = DIM*jnrF;
219             j_coord_offsetG  = DIM*jnrG;
220             j_coord_offsetH  = DIM*jnrH;
221
222             /* load j atom coordinates */
223             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
224                                                  x+j_coord_offsetC,x+j_coord_offsetD,
225                                                  x+j_coord_offsetE,x+j_coord_offsetF,
226                                                  x+j_coord_offsetG,x+j_coord_offsetH,
227                                                  &jx0,&jy0,&jz0);
228
229             /* Calculate displacement vector */
230             dx00             = _mm256_sub_ps(ix0,jx0);
231             dy00             = _mm256_sub_ps(iy0,jy0);
232             dz00             = _mm256_sub_ps(iz0,jz0);
233             dx10             = _mm256_sub_ps(ix1,jx0);
234             dy10             = _mm256_sub_ps(iy1,jy0);
235             dz10             = _mm256_sub_ps(iz1,jz0);
236             dx20             = _mm256_sub_ps(ix2,jx0);
237             dy20             = _mm256_sub_ps(iy2,jy0);
238             dz20             = _mm256_sub_ps(iz2,jz0);
239
240             /* Calculate squared distance and things based on it */
241             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
242             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
243             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
244
245             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
246             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
247             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
248
249             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
250             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
251             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
252
253             /* Load parameters for j particles */
254             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
255                                                                  charge+jnrC+0,charge+jnrD+0,
256                                                                  charge+jnrE+0,charge+jnrF+0,
257                                                                  charge+jnrG+0,charge+jnrH+0);
258             vdwjidx0A        = 2*vdwtype[jnrA+0];
259             vdwjidx0B        = 2*vdwtype[jnrB+0];
260             vdwjidx0C        = 2*vdwtype[jnrC+0];
261             vdwjidx0D        = 2*vdwtype[jnrD+0];
262             vdwjidx0E        = 2*vdwtype[jnrE+0];
263             vdwjidx0F        = 2*vdwtype[jnrF+0];
264             vdwjidx0G        = 2*vdwtype[jnrG+0];
265             vdwjidx0H        = 2*vdwtype[jnrH+0];
266
267             fjx0             = _mm256_setzero_ps();
268             fjy0             = _mm256_setzero_ps();
269             fjz0             = _mm256_setzero_ps();
270
271             /**************************
272              * CALCULATE INTERACTIONS *
273              **************************/
274
275             r00              = _mm256_mul_ps(rsq00,rinv00);
276
277             /* Compute parameters for interactions between i and j atoms */
278             qq00             = _mm256_mul_ps(iq0,jq0);
279             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
280                                             vdwioffsetptr0+vdwjidx0B,
281                                             vdwioffsetptr0+vdwjidx0C,
282                                             vdwioffsetptr0+vdwjidx0D,
283                                             vdwioffsetptr0+vdwjidx0E,
284                                             vdwioffsetptr0+vdwjidx0F,
285                                             vdwioffsetptr0+vdwjidx0G,
286                                             vdwioffsetptr0+vdwjidx0H,
287                                             &c6_00,&c12_00);
288
289             /* EWALD ELECTROSTATICS */
290             
291             /* Analytical PME correction */
292             zeta2            = _mm256_mul_ps(beta2,rsq00);
293             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
294             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
295             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
296             felec            = _mm256_mul_ps(qq00,felec);
297             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
298             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
299             velec            = _mm256_sub_ps(rinv00,pmecorrV);
300             velec            = _mm256_mul_ps(qq00,velec);
301             
302             /* LENNARD-JONES DISPERSION/REPULSION */
303
304             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
305             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
306             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
307             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
308             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
309
310             /* Update potential sum for this i atom from the interaction with this j atom. */
311             velecsum         = _mm256_add_ps(velecsum,velec);
312             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
313
314             fscal            = _mm256_add_ps(felec,fvdw);
315
316             /* Calculate temporary vectorial force */
317             tx               = _mm256_mul_ps(fscal,dx00);
318             ty               = _mm256_mul_ps(fscal,dy00);
319             tz               = _mm256_mul_ps(fscal,dz00);
320
321             /* Update vectorial force */
322             fix0             = _mm256_add_ps(fix0,tx);
323             fiy0             = _mm256_add_ps(fiy0,ty);
324             fiz0             = _mm256_add_ps(fiz0,tz);
325
326             fjx0             = _mm256_add_ps(fjx0,tx);
327             fjy0             = _mm256_add_ps(fjy0,ty);
328             fjz0             = _mm256_add_ps(fjz0,tz);
329
330             /**************************
331              * CALCULATE INTERACTIONS *
332              **************************/
333
334             r10              = _mm256_mul_ps(rsq10,rinv10);
335
336             /* Compute parameters for interactions between i and j atoms */
337             qq10             = _mm256_mul_ps(iq1,jq0);
338
339             /* EWALD ELECTROSTATICS */
340             
341             /* Analytical PME correction */
342             zeta2            = _mm256_mul_ps(beta2,rsq10);
343             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
344             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
345             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
346             felec            = _mm256_mul_ps(qq10,felec);
347             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
348             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
349             velec            = _mm256_sub_ps(rinv10,pmecorrV);
350             velec            = _mm256_mul_ps(qq10,velec);
351             
352             /* Update potential sum for this i atom from the interaction with this j atom. */
353             velecsum         = _mm256_add_ps(velecsum,velec);
354
355             fscal            = felec;
356
357             /* Calculate temporary vectorial force */
358             tx               = _mm256_mul_ps(fscal,dx10);
359             ty               = _mm256_mul_ps(fscal,dy10);
360             tz               = _mm256_mul_ps(fscal,dz10);
361
362             /* Update vectorial force */
363             fix1             = _mm256_add_ps(fix1,tx);
364             fiy1             = _mm256_add_ps(fiy1,ty);
365             fiz1             = _mm256_add_ps(fiz1,tz);
366
367             fjx0             = _mm256_add_ps(fjx0,tx);
368             fjy0             = _mm256_add_ps(fjy0,ty);
369             fjz0             = _mm256_add_ps(fjz0,tz);
370
371             /**************************
372              * CALCULATE INTERACTIONS *
373              **************************/
374
375             r20              = _mm256_mul_ps(rsq20,rinv20);
376
377             /* Compute parameters for interactions between i and j atoms */
378             qq20             = _mm256_mul_ps(iq2,jq0);
379
380             /* EWALD ELECTROSTATICS */
381             
382             /* Analytical PME correction */
383             zeta2            = _mm256_mul_ps(beta2,rsq20);
384             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
385             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
386             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
387             felec            = _mm256_mul_ps(qq20,felec);
388             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
389             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
390             velec            = _mm256_sub_ps(rinv20,pmecorrV);
391             velec            = _mm256_mul_ps(qq20,velec);
392             
393             /* Update potential sum for this i atom from the interaction with this j atom. */
394             velecsum         = _mm256_add_ps(velecsum,velec);
395
396             fscal            = felec;
397
398             /* Calculate temporary vectorial force */
399             tx               = _mm256_mul_ps(fscal,dx20);
400             ty               = _mm256_mul_ps(fscal,dy20);
401             tz               = _mm256_mul_ps(fscal,dz20);
402
403             /* Update vectorial force */
404             fix2             = _mm256_add_ps(fix2,tx);
405             fiy2             = _mm256_add_ps(fiy2,ty);
406             fiz2             = _mm256_add_ps(fiz2,tz);
407
408             fjx0             = _mm256_add_ps(fjx0,tx);
409             fjy0             = _mm256_add_ps(fjy0,ty);
410             fjz0             = _mm256_add_ps(fjz0,tz);
411
412             fjptrA             = f+j_coord_offsetA;
413             fjptrB             = f+j_coord_offsetB;
414             fjptrC             = f+j_coord_offsetC;
415             fjptrD             = f+j_coord_offsetD;
416             fjptrE             = f+j_coord_offsetE;
417             fjptrF             = f+j_coord_offsetF;
418             fjptrG             = f+j_coord_offsetG;
419             fjptrH             = f+j_coord_offsetH;
420
421             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
422
423             /* Inner loop uses 267 flops */
424         }
425
426         if(jidx<j_index_end)
427         {
428
429             /* Get j neighbor index, and coordinate index */
430             jnrlistA         = jjnr[jidx];
431             jnrlistB         = jjnr[jidx+1];
432             jnrlistC         = jjnr[jidx+2];
433             jnrlistD         = jjnr[jidx+3];
434             jnrlistE         = jjnr[jidx+4];
435             jnrlistF         = jjnr[jidx+5];
436             jnrlistG         = jjnr[jidx+6];
437             jnrlistH         = jjnr[jidx+7];
438             /* Sign of each element will be negative for non-real atoms.
439              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
440              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
441              */
442             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
443                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
444                                             
445             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
446             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
447             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
448             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
449             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
450             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
451             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
452             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
453             j_coord_offsetA  = DIM*jnrA;
454             j_coord_offsetB  = DIM*jnrB;
455             j_coord_offsetC  = DIM*jnrC;
456             j_coord_offsetD  = DIM*jnrD;
457             j_coord_offsetE  = DIM*jnrE;
458             j_coord_offsetF  = DIM*jnrF;
459             j_coord_offsetG  = DIM*jnrG;
460             j_coord_offsetH  = DIM*jnrH;
461
462             /* load j atom coordinates */
463             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
464                                                  x+j_coord_offsetC,x+j_coord_offsetD,
465                                                  x+j_coord_offsetE,x+j_coord_offsetF,
466                                                  x+j_coord_offsetG,x+j_coord_offsetH,
467                                                  &jx0,&jy0,&jz0);
468
469             /* Calculate displacement vector */
470             dx00             = _mm256_sub_ps(ix0,jx0);
471             dy00             = _mm256_sub_ps(iy0,jy0);
472             dz00             = _mm256_sub_ps(iz0,jz0);
473             dx10             = _mm256_sub_ps(ix1,jx0);
474             dy10             = _mm256_sub_ps(iy1,jy0);
475             dz10             = _mm256_sub_ps(iz1,jz0);
476             dx20             = _mm256_sub_ps(ix2,jx0);
477             dy20             = _mm256_sub_ps(iy2,jy0);
478             dz20             = _mm256_sub_ps(iz2,jz0);
479
480             /* Calculate squared distance and things based on it */
481             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
482             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
483             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
484
485             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
486             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
487             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
488
489             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
490             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
491             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
492
493             /* Load parameters for j particles */
494             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
495                                                                  charge+jnrC+0,charge+jnrD+0,
496                                                                  charge+jnrE+0,charge+jnrF+0,
497                                                                  charge+jnrG+0,charge+jnrH+0);
498             vdwjidx0A        = 2*vdwtype[jnrA+0];
499             vdwjidx0B        = 2*vdwtype[jnrB+0];
500             vdwjidx0C        = 2*vdwtype[jnrC+0];
501             vdwjidx0D        = 2*vdwtype[jnrD+0];
502             vdwjidx0E        = 2*vdwtype[jnrE+0];
503             vdwjidx0F        = 2*vdwtype[jnrF+0];
504             vdwjidx0G        = 2*vdwtype[jnrG+0];
505             vdwjidx0H        = 2*vdwtype[jnrH+0];
506
507             fjx0             = _mm256_setzero_ps();
508             fjy0             = _mm256_setzero_ps();
509             fjz0             = _mm256_setzero_ps();
510
511             /**************************
512              * CALCULATE INTERACTIONS *
513              **************************/
514
515             r00              = _mm256_mul_ps(rsq00,rinv00);
516             r00              = _mm256_andnot_ps(dummy_mask,r00);
517
518             /* Compute parameters for interactions between i and j atoms */
519             qq00             = _mm256_mul_ps(iq0,jq0);
520             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
521                                             vdwioffsetptr0+vdwjidx0B,
522                                             vdwioffsetptr0+vdwjidx0C,
523                                             vdwioffsetptr0+vdwjidx0D,
524                                             vdwioffsetptr0+vdwjidx0E,
525                                             vdwioffsetptr0+vdwjidx0F,
526                                             vdwioffsetptr0+vdwjidx0G,
527                                             vdwioffsetptr0+vdwjidx0H,
528                                             &c6_00,&c12_00);
529
530             /* EWALD ELECTROSTATICS */
531             
532             /* Analytical PME correction */
533             zeta2            = _mm256_mul_ps(beta2,rsq00);
534             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
535             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
536             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
537             felec            = _mm256_mul_ps(qq00,felec);
538             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
539             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
540             velec            = _mm256_sub_ps(rinv00,pmecorrV);
541             velec            = _mm256_mul_ps(qq00,velec);
542             
543             /* LENNARD-JONES DISPERSION/REPULSION */
544
545             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
546             vvdw6            = _mm256_mul_ps(c6_00,rinvsix);
547             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
548             vvdw             = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
549             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
550
551             /* Update potential sum for this i atom from the interaction with this j atom. */
552             velec            = _mm256_andnot_ps(dummy_mask,velec);
553             velecsum         = _mm256_add_ps(velecsum,velec);
554             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
555             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
556
557             fscal            = _mm256_add_ps(felec,fvdw);
558
559             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
560
561             /* Calculate temporary vectorial force */
562             tx               = _mm256_mul_ps(fscal,dx00);
563             ty               = _mm256_mul_ps(fscal,dy00);
564             tz               = _mm256_mul_ps(fscal,dz00);
565
566             /* Update vectorial force */
567             fix0             = _mm256_add_ps(fix0,tx);
568             fiy0             = _mm256_add_ps(fiy0,ty);
569             fiz0             = _mm256_add_ps(fiz0,tz);
570
571             fjx0             = _mm256_add_ps(fjx0,tx);
572             fjy0             = _mm256_add_ps(fjy0,ty);
573             fjz0             = _mm256_add_ps(fjz0,tz);
574
575             /**************************
576              * CALCULATE INTERACTIONS *
577              **************************/
578
579             r10              = _mm256_mul_ps(rsq10,rinv10);
580             r10              = _mm256_andnot_ps(dummy_mask,r10);
581
582             /* Compute parameters for interactions between i and j atoms */
583             qq10             = _mm256_mul_ps(iq1,jq0);
584
585             /* EWALD ELECTROSTATICS */
586             
587             /* Analytical PME correction */
588             zeta2            = _mm256_mul_ps(beta2,rsq10);
589             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
590             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
591             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
592             felec            = _mm256_mul_ps(qq10,felec);
593             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
594             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
595             velec            = _mm256_sub_ps(rinv10,pmecorrV);
596             velec            = _mm256_mul_ps(qq10,velec);
597             
598             /* Update potential sum for this i atom from the interaction with this j atom. */
599             velec            = _mm256_andnot_ps(dummy_mask,velec);
600             velecsum         = _mm256_add_ps(velecsum,velec);
601
602             fscal            = felec;
603
604             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
605
606             /* Calculate temporary vectorial force */
607             tx               = _mm256_mul_ps(fscal,dx10);
608             ty               = _mm256_mul_ps(fscal,dy10);
609             tz               = _mm256_mul_ps(fscal,dz10);
610
611             /* Update vectorial force */
612             fix1             = _mm256_add_ps(fix1,tx);
613             fiy1             = _mm256_add_ps(fiy1,ty);
614             fiz1             = _mm256_add_ps(fiz1,tz);
615
616             fjx0             = _mm256_add_ps(fjx0,tx);
617             fjy0             = _mm256_add_ps(fjy0,ty);
618             fjz0             = _mm256_add_ps(fjz0,tz);
619
620             /**************************
621              * CALCULATE INTERACTIONS *
622              **************************/
623
624             r20              = _mm256_mul_ps(rsq20,rinv20);
625             r20              = _mm256_andnot_ps(dummy_mask,r20);
626
627             /* Compute parameters for interactions between i and j atoms */
628             qq20             = _mm256_mul_ps(iq2,jq0);
629
630             /* EWALD ELECTROSTATICS */
631             
632             /* Analytical PME correction */
633             zeta2            = _mm256_mul_ps(beta2,rsq20);
634             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
635             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
636             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
637             felec            = _mm256_mul_ps(qq20,felec);
638             pmecorrV         = gmx_mm256_pmecorrV_ps(zeta2);
639             pmecorrV         = _mm256_mul_ps(pmecorrV,beta);
640             velec            = _mm256_sub_ps(rinv20,pmecorrV);
641             velec            = _mm256_mul_ps(qq20,velec);
642             
643             /* Update potential sum for this i atom from the interaction with this j atom. */
644             velec            = _mm256_andnot_ps(dummy_mask,velec);
645             velecsum         = _mm256_add_ps(velecsum,velec);
646
647             fscal            = felec;
648
649             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
650
651             /* Calculate temporary vectorial force */
652             tx               = _mm256_mul_ps(fscal,dx20);
653             ty               = _mm256_mul_ps(fscal,dy20);
654             tz               = _mm256_mul_ps(fscal,dz20);
655
656             /* Update vectorial force */
657             fix2             = _mm256_add_ps(fix2,tx);
658             fiy2             = _mm256_add_ps(fiy2,ty);
659             fiz2             = _mm256_add_ps(fiz2,tz);
660
661             fjx0             = _mm256_add_ps(fjx0,tx);
662             fjy0             = _mm256_add_ps(fjy0,ty);
663             fjz0             = _mm256_add_ps(fjz0,tz);
664
665             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
666             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
667             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
668             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
669             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
670             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
671             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
672             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
673
674             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
675
676             /* Inner loop uses 270 flops */
677         }
678
679         /* End of innermost loop */
680
681         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
682                                                  f+i_coord_offset,fshift+i_shift_offset);
683
684         ggid                        = gid[iidx];
685         /* Update potential energies */
686         gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
687         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
688
689         /* Increment number of inner iterations */
690         inneriter                  += j_index_end - j_index_start;
691
692         /* Outer loop uses 20 flops */
693     }
694
695     /* Increment number of outer iterations */
696     outeriter        += nri;
697
698     /* Update outer/inner flops */
699
700     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*270);
701 }
702 /*
703  * Gromacs nonbonded kernel:   nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_256_single
704  * Electrostatics interaction: Ewald
705  * VdW interaction:            LennardJones
706  * Geometry:                   Water3-Particle
707  * Calculate force/pot:        Force
708  */
709 void
710 nb_kernel_ElecEw_VdwLJ_GeomW3P1_F_avx_256_single
711                     (t_nblist                    * gmx_restrict       nlist,
712                      rvec                        * gmx_restrict          xx,
713                      rvec                        * gmx_restrict          ff,
714                      t_forcerec                  * gmx_restrict          fr,
715                      t_mdatoms                   * gmx_restrict     mdatoms,
716                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
717                      t_nrnb                      * gmx_restrict        nrnb)
718 {
719     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
720      * just 0 for non-waters.
721      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
722      * jnr indices corresponding to data put in the four positions in the SIMD register.
723      */
724     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
725     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
726     int              jnrA,jnrB,jnrC,jnrD;
727     int              jnrE,jnrF,jnrG,jnrH;
728     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
729     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
730     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
731     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
732     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
733     real             rcutoff_scalar;
734     real             *shiftvec,*fshift,*x,*f;
735     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
736     real             scratch[4*DIM];
737     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
738     real *           vdwioffsetptr0;
739     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
740     real *           vdwioffsetptr1;
741     __m256           ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
742     real *           vdwioffsetptr2;
743     __m256           ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
744     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
745     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
746     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
747     __m256           dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
748     __m256           dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
749     __m256           velec,felec,velecsum,facel,crf,krf,krf2;
750     real             *charge;
751     int              nvdwtype;
752     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
753     int              *vdwtype;
754     real             *vdwparam;
755     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
756     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
757     __m256i          ewitab;
758     __m128i          ewitab_lo,ewitab_hi;
759     __m256           ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
760     __m256           beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
761     real             *ewtab;
762     __m256           dummy_mask,cutoff_mask;
763     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
764     __m256           one     = _mm256_set1_ps(1.0);
765     __m256           two     = _mm256_set1_ps(2.0);
766     x                = xx[0];
767     f                = ff[0];
768
769     nri              = nlist->nri;
770     iinr             = nlist->iinr;
771     jindex           = nlist->jindex;
772     jjnr             = nlist->jjnr;
773     shiftidx         = nlist->shift;
774     gid              = nlist->gid;
775     shiftvec         = fr->shift_vec[0];
776     fshift           = fr->fshift[0];
777     facel            = _mm256_set1_ps(fr->epsfac);
778     charge           = mdatoms->chargeA;
779     nvdwtype         = fr->ntype;
780     vdwparam         = fr->nbfp;
781     vdwtype          = mdatoms->typeA;
782
783     sh_ewald         = _mm256_set1_ps(fr->ic->sh_ewald);
784     beta             = _mm256_set1_ps(fr->ic->ewaldcoeff_q);
785     beta2            = _mm256_mul_ps(beta,beta);
786     beta3            = _mm256_mul_ps(beta,beta2);
787
788     ewtab            = fr->ic->tabq_coul_F;
789     ewtabscale       = _mm256_set1_ps(fr->ic->tabq_scale);
790     ewtabhalfspace   = _mm256_set1_ps(0.5/fr->ic->tabq_scale);
791
792     /* Setup water-specific parameters */
793     inr              = nlist->iinr[0];
794     iq0              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
795     iq1              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
796     iq2              = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
797     vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
798
799     /* Avoid stupid compiler warnings */
800     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
801     j_coord_offsetA = 0;
802     j_coord_offsetB = 0;
803     j_coord_offsetC = 0;
804     j_coord_offsetD = 0;
805     j_coord_offsetE = 0;
806     j_coord_offsetF = 0;
807     j_coord_offsetG = 0;
808     j_coord_offsetH = 0;
809
810     outeriter        = 0;
811     inneriter        = 0;
812
813     for(iidx=0;iidx<4*DIM;iidx++)
814     {
815         scratch[iidx] = 0.0;
816     }
817
818     /* Start outer loop over neighborlists */
819     for(iidx=0; iidx<nri; iidx++)
820     {
821         /* Load shift vector for this list */
822         i_shift_offset   = DIM*shiftidx[iidx];
823
824         /* Load limits for loop over neighbors */
825         j_index_start    = jindex[iidx];
826         j_index_end      = jindex[iidx+1];
827
828         /* Get outer coordinate index */
829         inr              = iinr[iidx];
830         i_coord_offset   = DIM*inr;
831
832         /* Load i particle coords and add shift vector */
833         gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
834                                                     &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
835
836         fix0             = _mm256_setzero_ps();
837         fiy0             = _mm256_setzero_ps();
838         fiz0             = _mm256_setzero_ps();
839         fix1             = _mm256_setzero_ps();
840         fiy1             = _mm256_setzero_ps();
841         fiz1             = _mm256_setzero_ps();
842         fix2             = _mm256_setzero_ps();
843         fiy2             = _mm256_setzero_ps();
844         fiz2             = _mm256_setzero_ps();
845
846         /* Start inner kernel loop */
847         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
848         {
849
850             /* Get j neighbor index, and coordinate index */
851             jnrA             = jjnr[jidx];
852             jnrB             = jjnr[jidx+1];
853             jnrC             = jjnr[jidx+2];
854             jnrD             = jjnr[jidx+3];
855             jnrE             = jjnr[jidx+4];
856             jnrF             = jjnr[jidx+5];
857             jnrG             = jjnr[jidx+6];
858             jnrH             = jjnr[jidx+7];
859             j_coord_offsetA  = DIM*jnrA;
860             j_coord_offsetB  = DIM*jnrB;
861             j_coord_offsetC  = DIM*jnrC;
862             j_coord_offsetD  = DIM*jnrD;
863             j_coord_offsetE  = DIM*jnrE;
864             j_coord_offsetF  = DIM*jnrF;
865             j_coord_offsetG  = DIM*jnrG;
866             j_coord_offsetH  = DIM*jnrH;
867
868             /* load j atom coordinates */
869             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
870                                                  x+j_coord_offsetC,x+j_coord_offsetD,
871                                                  x+j_coord_offsetE,x+j_coord_offsetF,
872                                                  x+j_coord_offsetG,x+j_coord_offsetH,
873                                                  &jx0,&jy0,&jz0);
874
875             /* Calculate displacement vector */
876             dx00             = _mm256_sub_ps(ix0,jx0);
877             dy00             = _mm256_sub_ps(iy0,jy0);
878             dz00             = _mm256_sub_ps(iz0,jz0);
879             dx10             = _mm256_sub_ps(ix1,jx0);
880             dy10             = _mm256_sub_ps(iy1,jy0);
881             dz10             = _mm256_sub_ps(iz1,jz0);
882             dx20             = _mm256_sub_ps(ix2,jx0);
883             dy20             = _mm256_sub_ps(iy2,jy0);
884             dz20             = _mm256_sub_ps(iz2,jz0);
885
886             /* Calculate squared distance and things based on it */
887             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
888             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
889             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
890
891             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
892             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
893             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
894
895             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
896             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
897             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
898
899             /* Load parameters for j particles */
900             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
901                                                                  charge+jnrC+0,charge+jnrD+0,
902                                                                  charge+jnrE+0,charge+jnrF+0,
903                                                                  charge+jnrG+0,charge+jnrH+0);
904             vdwjidx0A        = 2*vdwtype[jnrA+0];
905             vdwjidx0B        = 2*vdwtype[jnrB+0];
906             vdwjidx0C        = 2*vdwtype[jnrC+0];
907             vdwjidx0D        = 2*vdwtype[jnrD+0];
908             vdwjidx0E        = 2*vdwtype[jnrE+0];
909             vdwjidx0F        = 2*vdwtype[jnrF+0];
910             vdwjidx0G        = 2*vdwtype[jnrG+0];
911             vdwjidx0H        = 2*vdwtype[jnrH+0];
912
913             fjx0             = _mm256_setzero_ps();
914             fjy0             = _mm256_setzero_ps();
915             fjz0             = _mm256_setzero_ps();
916
917             /**************************
918              * CALCULATE INTERACTIONS *
919              **************************/
920
921             r00              = _mm256_mul_ps(rsq00,rinv00);
922
923             /* Compute parameters for interactions between i and j atoms */
924             qq00             = _mm256_mul_ps(iq0,jq0);
925             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
926                                             vdwioffsetptr0+vdwjidx0B,
927                                             vdwioffsetptr0+vdwjidx0C,
928                                             vdwioffsetptr0+vdwjidx0D,
929                                             vdwioffsetptr0+vdwjidx0E,
930                                             vdwioffsetptr0+vdwjidx0F,
931                                             vdwioffsetptr0+vdwjidx0G,
932                                             vdwioffsetptr0+vdwjidx0H,
933                                             &c6_00,&c12_00);
934
935             /* EWALD ELECTROSTATICS */
936             
937             /* Analytical PME correction */
938             zeta2            = _mm256_mul_ps(beta2,rsq00);
939             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
940             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
941             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
942             felec            = _mm256_mul_ps(qq00,felec);
943             
944             /* LENNARD-JONES DISPERSION/REPULSION */
945
946             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
947             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
948
949             fscal            = _mm256_add_ps(felec,fvdw);
950
951             /* Calculate temporary vectorial force */
952             tx               = _mm256_mul_ps(fscal,dx00);
953             ty               = _mm256_mul_ps(fscal,dy00);
954             tz               = _mm256_mul_ps(fscal,dz00);
955
956             /* Update vectorial force */
957             fix0             = _mm256_add_ps(fix0,tx);
958             fiy0             = _mm256_add_ps(fiy0,ty);
959             fiz0             = _mm256_add_ps(fiz0,tz);
960
961             fjx0             = _mm256_add_ps(fjx0,tx);
962             fjy0             = _mm256_add_ps(fjy0,ty);
963             fjz0             = _mm256_add_ps(fjz0,tz);
964
965             /**************************
966              * CALCULATE INTERACTIONS *
967              **************************/
968
969             r10              = _mm256_mul_ps(rsq10,rinv10);
970
971             /* Compute parameters for interactions between i and j atoms */
972             qq10             = _mm256_mul_ps(iq1,jq0);
973
974             /* EWALD ELECTROSTATICS */
975             
976             /* Analytical PME correction */
977             zeta2            = _mm256_mul_ps(beta2,rsq10);
978             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
979             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
980             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
981             felec            = _mm256_mul_ps(qq10,felec);
982             
983             fscal            = felec;
984
985             /* Calculate temporary vectorial force */
986             tx               = _mm256_mul_ps(fscal,dx10);
987             ty               = _mm256_mul_ps(fscal,dy10);
988             tz               = _mm256_mul_ps(fscal,dz10);
989
990             /* Update vectorial force */
991             fix1             = _mm256_add_ps(fix1,tx);
992             fiy1             = _mm256_add_ps(fiy1,ty);
993             fiz1             = _mm256_add_ps(fiz1,tz);
994
995             fjx0             = _mm256_add_ps(fjx0,tx);
996             fjy0             = _mm256_add_ps(fjy0,ty);
997             fjz0             = _mm256_add_ps(fjz0,tz);
998
999             /**************************
1000              * CALCULATE INTERACTIONS *
1001              **************************/
1002
1003             r20              = _mm256_mul_ps(rsq20,rinv20);
1004
1005             /* Compute parameters for interactions between i and j atoms */
1006             qq20             = _mm256_mul_ps(iq2,jq0);
1007
1008             /* EWALD ELECTROSTATICS */
1009             
1010             /* Analytical PME correction */
1011             zeta2            = _mm256_mul_ps(beta2,rsq20);
1012             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1013             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1014             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1015             felec            = _mm256_mul_ps(qq20,felec);
1016             
1017             fscal            = felec;
1018
1019             /* Calculate temporary vectorial force */
1020             tx               = _mm256_mul_ps(fscal,dx20);
1021             ty               = _mm256_mul_ps(fscal,dy20);
1022             tz               = _mm256_mul_ps(fscal,dz20);
1023
1024             /* Update vectorial force */
1025             fix2             = _mm256_add_ps(fix2,tx);
1026             fiy2             = _mm256_add_ps(fiy2,ty);
1027             fiz2             = _mm256_add_ps(fiz2,tz);
1028
1029             fjx0             = _mm256_add_ps(fjx0,tx);
1030             fjy0             = _mm256_add_ps(fjy0,ty);
1031             fjz0             = _mm256_add_ps(fjz0,tz);
1032
1033             fjptrA             = f+j_coord_offsetA;
1034             fjptrB             = f+j_coord_offsetB;
1035             fjptrC             = f+j_coord_offsetC;
1036             fjptrD             = f+j_coord_offsetD;
1037             fjptrE             = f+j_coord_offsetE;
1038             fjptrF             = f+j_coord_offsetF;
1039             fjptrG             = f+j_coord_offsetG;
1040             fjptrH             = f+j_coord_offsetH;
1041
1042             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1043
1044             /* Inner loop uses 178 flops */
1045         }
1046
1047         if(jidx<j_index_end)
1048         {
1049
1050             /* Get j neighbor index, and coordinate index */
1051             jnrlistA         = jjnr[jidx];
1052             jnrlistB         = jjnr[jidx+1];
1053             jnrlistC         = jjnr[jidx+2];
1054             jnrlistD         = jjnr[jidx+3];
1055             jnrlistE         = jjnr[jidx+4];
1056             jnrlistF         = jjnr[jidx+5];
1057             jnrlistG         = jjnr[jidx+6];
1058             jnrlistH         = jjnr[jidx+7];
1059             /* Sign of each element will be negative for non-real atoms.
1060              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1061              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1062              */
1063             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1064                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1065                                             
1066             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
1067             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
1068             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
1069             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
1070             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
1071             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
1072             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
1073             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
1074             j_coord_offsetA  = DIM*jnrA;
1075             j_coord_offsetB  = DIM*jnrB;
1076             j_coord_offsetC  = DIM*jnrC;
1077             j_coord_offsetD  = DIM*jnrD;
1078             j_coord_offsetE  = DIM*jnrE;
1079             j_coord_offsetF  = DIM*jnrF;
1080             j_coord_offsetG  = DIM*jnrG;
1081             j_coord_offsetH  = DIM*jnrH;
1082
1083             /* load j atom coordinates */
1084             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1085                                                  x+j_coord_offsetC,x+j_coord_offsetD,
1086                                                  x+j_coord_offsetE,x+j_coord_offsetF,
1087                                                  x+j_coord_offsetG,x+j_coord_offsetH,
1088                                                  &jx0,&jy0,&jz0);
1089
1090             /* Calculate displacement vector */
1091             dx00             = _mm256_sub_ps(ix0,jx0);
1092             dy00             = _mm256_sub_ps(iy0,jy0);
1093             dz00             = _mm256_sub_ps(iz0,jz0);
1094             dx10             = _mm256_sub_ps(ix1,jx0);
1095             dy10             = _mm256_sub_ps(iy1,jy0);
1096             dz10             = _mm256_sub_ps(iz1,jz0);
1097             dx20             = _mm256_sub_ps(ix2,jx0);
1098             dy20             = _mm256_sub_ps(iy2,jy0);
1099             dz20             = _mm256_sub_ps(iz2,jz0);
1100
1101             /* Calculate squared distance and things based on it */
1102             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1103             rsq10            = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1104             rsq20            = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1105
1106             rinv00           = gmx_mm256_invsqrt_ps(rsq00);
1107             rinv10           = gmx_mm256_invsqrt_ps(rsq10);
1108             rinv20           = gmx_mm256_invsqrt_ps(rsq20);
1109
1110             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
1111             rinvsq10         = _mm256_mul_ps(rinv10,rinv10);
1112             rinvsq20         = _mm256_mul_ps(rinv20,rinv20);
1113
1114             /* Load parameters for j particles */
1115             jq0              = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1116                                                                  charge+jnrC+0,charge+jnrD+0,
1117                                                                  charge+jnrE+0,charge+jnrF+0,
1118                                                                  charge+jnrG+0,charge+jnrH+0);
1119             vdwjidx0A        = 2*vdwtype[jnrA+0];
1120             vdwjidx0B        = 2*vdwtype[jnrB+0];
1121             vdwjidx0C        = 2*vdwtype[jnrC+0];
1122             vdwjidx0D        = 2*vdwtype[jnrD+0];
1123             vdwjidx0E        = 2*vdwtype[jnrE+0];
1124             vdwjidx0F        = 2*vdwtype[jnrF+0];
1125             vdwjidx0G        = 2*vdwtype[jnrG+0];
1126             vdwjidx0H        = 2*vdwtype[jnrH+0];
1127
1128             fjx0             = _mm256_setzero_ps();
1129             fjy0             = _mm256_setzero_ps();
1130             fjz0             = _mm256_setzero_ps();
1131
1132             /**************************
1133              * CALCULATE INTERACTIONS *
1134              **************************/
1135
1136             r00              = _mm256_mul_ps(rsq00,rinv00);
1137             r00              = _mm256_andnot_ps(dummy_mask,r00);
1138
1139             /* Compute parameters for interactions between i and j atoms */
1140             qq00             = _mm256_mul_ps(iq0,jq0);
1141             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1142                                             vdwioffsetptr0+vdwjidx0B,
1143                                             vdwioffsetptr0+vdwjidx0C,
1144                                             vdwioffsetptr0+vdwjidx0D,
1145                                             vdwioffsetptr0+vdwjidx0E,
1146                                             vdwioffsetptr0+vdwjidx0F,
1147                                             vdwioffsetptr0+vdwjidx0G,
1148                                             vdwioffsetptr0+vdwjidx0H,
1149                                             &c6_00,&c12_00);
1150
1151             /* EWALD ELECTROSTATICS */
1152             
1153             /* Analytical PME correction */
1154             zeta2            = _mm256_mul_ps(beta2,rsq00);
1155             rinv3            = _mm256_mul_ps(rinvsq00,rinv00);
1156             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1157             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1158             felec            = _mm256_mul_ps(qq00,felec);
1159             
1160             /* LENNARD-JONES DISPERSION/REPULSION */
1161
1162             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1163             fvdw             = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1164
1165             fscal            = _mm256_add_ps(felec,fvdw);
1166
1167             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1168
1169             /* Calculate temporary vectorial force */
1170             tx               = _mm256_mul_ps(fscal,dx00);
1171             ty               = _mm256_mul_ps(fscal,dy00);
1172             tz               = _mm256_mul_ps(fscal,dz00);
1173
1174             /* Update vectorial force */
1175             fix0             = _mm256_add_ps(fix0,tx);
1176             fiy0             = _mm256_add_ps(fiy0,ty);
1177             fiz0             = _mm256_add_ps(fiz0,tz);
1178
1179             fjx0             = _mm256_add_ps(fjx0,tx);
1180             fjy0             = _mm256_add_ps(fjy0,ty);
1181             fjz0             = _mm256_add_ps(fjz0,tz);
1182
1183             /**************************
1184              * CALCULATE INTERACTIONS *
1185              **************************/
1186
1187             r10              = _mm256_mul_ps(rsq10,rinv10);
1188             r10              = _mm256_andnot_ps(dummy_mask,r10);
1189
1190             /* Compute parameters for interactions between i and j atoms */
1191             qq10             = _mm256_mul_ps(iq1,jq0);
1192
1193             /* EWALD ELECTROSTATICS */
1194             
1195             /* Analytical PME correction */
1196             zeta2            = _mm256_mul_ps(beta2,rsq10);
1197             rinv3            = _mm256_mul_ps(rinvsq10,rinv10);
1198             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1199             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1200             felec            = _mm256_mul_ps(qq10,felec);
1201             
1202             fscal            = felec;
1203
1204             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1205
1206             /* Calculate temporary vectorial force */
1207             tx               = _mm256_mul_ps(fscal,dx10);
1208             ty               = _mm256_mul_ps(fscal,dy10);
1209             tz               = _mm256_mul_ps(fscal,dz10);
1210
1211             /* Update vectorial force */
1212             fix1             = _mm256_add_ps(fix1,tx);
1213             fiy1             = _mm256_add_ps(fiy1,ty);
1214             fiz1             = _mm256_add_ps(fiz1,tz);
1215
1216             fjx0             = _mm256_add_ps(fjx0,tx);
1217             fjy0             = _mm256_add_ps(fjy0,ty);
1218             fjz0             = _mm256_add_ps(fjz0,tz);
1219
1220             /**************************
1221              * CALCULATE INTERACTIONS *
1222              **************************/
1223
1224             r20              = _mm256_mul_ps(rsq20,rinv20);
1225             r20              = _mm256_andnot_ps(dummy_mask,r20);
1226
1227             /* Compute parameters for interactions between i and j atoms */
1228             qq20             = _mm256_mul_ps(iq2,jq0);
1229
1230             /* EWALD ELECTROSTATICS */
1231             
1232             /* Analytical PME correction */
1233             zeta2            = _mm256_mul_ps(beta2,rsq20);
1234             rinv3            = _mm256_mul_ps(rinvsq20,rinv20);
1235             pmecorrF         = gmx_mm256_pmecorrF_ps(zeta2);
1236             felec            = _mm256_add_ps( _mm256_mul_ps(pmecorrF,beta3), rinv3);
1237             felec            = _mm256_mul_ps(qq20,felec);
1238             
1239             fscal            = felec;
1240
1241             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
1242
1243             /* Calculate temporary vectorial force */
1244             tx               = _mm256_mul_ps(fscal,dx20);
1245             ty               = _mm256_mul_ps(fscal,dy20);
1246             tz               = _mm256_mul_ps(fscal,dz20);
1247
1248             /* Update vectorial force */
1249             fix2             = _mm256_add_ps(fix2,tx);
1250             fiy2             = _mm256_add_ps(fiy2,ty);
1251             fiz2             = _mm256_add_ps(fiz2,tz);
1252
1253             fjx0             = _mm256_add_ps(fjx0,tx);
1254             fjy0             = _mm256_add_ps(fjy0,ty);
1255             fjz0             = _mm256_add_ps(fjz0,tz);
1256
1257             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1258             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1259             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1260             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1261             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1262             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1263             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1264             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1265
1266             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1267
1268             /* Inner loop uses 181 flops */
1269         }
1270
1271         /* End of innermost loop */
1272
1273         gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1274                                                  f+i_coord_offset,fshift+i_shift_offset);
1275
1276         /* Increment number of inner iterations */
1277         inneriter                  += j_index_end - j_index_start;
1278
1279         /* Outer loop uses 18 flops */
1280     }
1281
1282     /* Increment number of outer iterations */
1283     outeriter        += nri;
1284
1285     /* Update outer/inner flops */
1286
1287     inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*181);
1288 }