Compile nonbonded kernels as C++
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_kernel_avx_256_single / nb_kernel_ElecNone_VdwLJEw_GeomP1P1_avx_256_single.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35 /*
36  * Note: this file was generated by the GROMACS avx_256_single kernel generator.
37  */
38 #include "gmxpre.h"
39
40 #include "config.h"
41
42 #include <math.h>
43
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
46
47 #include "kernelutil_x86_avx_256_single.h"
48
49 /*
50  * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_single
51  * Electrostatics interaction: None
52  * VdW interaction:            LJEwald
53  * Geometry:                   Particle-Particle
54  * Calculate force/pot:        PotentialAndForce
55  */
56 void
57 nb_kernel_ElecNone_VdwLJEw_GeomP1P1_VF_avx_256_single
58                     (t_nblist                    * gmx_restrict       nlist,
59                      rvec                        * gmx_restrict          xx,
60                      rvec                        * gmx_restrict          ff,
61                      struct t_forcerec           * gmx_restrict          fr,
62                      t_mdatoms                   * gmx_restrict     mdatoms,
63                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64                      t_nrnb                      * gmx_restrict        nrnb)
65 {
66     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
67      * just 0 for non-waters.
68      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69      * jnr indices corresponding to data put in the four positions in the SIMD register.
70      */
71     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
72     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73     int              jnrA,jnrB,jnrC,jnrD;
74     int              jnrE,jnrF,jnrG,jnrH;
75     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
80     real             rcutoff_scalar;
81     real             *shiftvec,*fshift,*x,*f;
82     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
83     real             scratch[4*DIM];
84     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85     real *           vdwioffsetptr0;
86     real *           vdwgridioffsetptr0;
87     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
89     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
90     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
91     int              nvdwtype;
92     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
93     int              *vdwtype;
94     real             *vdwparam;
95     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
96     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
97     __m256           c6grid_00;
98     real             *vdwgridparam;
99     __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
100     __m256           one_half  = _mm256_set1_ps(0.5);
101     __m256           minus_one = _mm256_set1_ps(-1.0);
102     __m256           dummy_mask,cutoff_mask;
103     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
104     __m256           one     = _mm256_set1_ps(1.0);
105     __m256           two     = _mm256_set1_ps(2.0);
106     x                = xx[0];
107     f                = ff[0];
108
109     nri              = nlist->nri;
110     iinr             = nlist->iinr;
111     jindex           = nlist->jindex;
112     jjnr             = nlist->jjnr;
113     shiftidx         = nlist->shift;
114     gid              = nlist->gid;
115     shiftvec         = fr->shift_vec[0];
116     fshift           = fr->fshift[0];
117     nvdwtype         = fr->ntype;
118     vdwparam         = fr->nbfp;
119     vdwtype          = mdatoms->typeA;
120     vdwgridparam     = fr->ljpme_c6grid;
121     sh_lj_ewald      = _mm256_set1_ps(fr->ic->sh_lj_ewald);
122     ewclj            = _mm256_set1_ps(fr->ic->ewaldcoeff_lj);
123     ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
124
125     /* Avoid stupid compiler warnings */
126     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
127     j_coord_offsetA = 0;
128     j_coord_offsetB = 0;
129     j_coord_offsetC = 0;
130     j_coord_offsetD = 0;
131     j_coord_offsetE = 0;
132     j_coord_offsetF = 0;
133     j_coord_offsetG = 0;
134     j_coord_offsetH = 0;
135
136     outeriter        = 0;
137     inneriter        = 0;
138
139     for(iidx=0;iidx<4*DIM;iidx++)
140     {
141         scratch[iidx] = 0.0;
142     }
143
144     /* Start outer loop over neighborlists */
145     for(iidx=0; iidx<nri; iidx++)
146     {
147         /* Load shift vector for this list */
148         i_shift_offset   = DIM*shiftidx[iidx];
149
150         /* Load limits for loop over neighbors */
151         j_index_start    = jindex[iidx];
152         j_index_end      = jindex[iidx+1];
153
154         /* Get outer coordinate index */
155         inr              = iinr[iidx];
156         i_coord_offset   = DIM*inr;
157
158         /* Load i particle coords and add shift vector */
159         gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
160
161         fix0             = _mm256_setzero_ps();
162         fiy0             = _mm256_setzero_ps();
163         fiz0             = _mm256_setzero_ps();
164
165         /* Load parameters for i particles */
166         vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
167         vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
168
169         /* Reset potential sums */
170         vvdwsum          = _mm256_setzero_ps();
171
172         /* Start inner kernel loop */
173         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
174         {
175
176             /* Get j neighbor index, and coordinate index */
177             jnrA             = jjnr[jidx];
178             jnrB             = jjnr[jidx+1];
179             jnrC             = jjnr[jidx+2];
180             jnrD             = jjnr[jidx+3];
181             jnrE             = jjnr[jidx+4];
182             jnrF             = jjnr[jidx+5];
183             jnrG             = jjnr[jidx+6];
184             jnrH             = jjnr[jidx+7];
185             j_coord_offsetA  = DIM*jnrA;
186             j_coord_offsetB  = DIM*jnrB;
187             j_coord_offsetC  = DIM*jnrC;
188             j_coord_offsetD  = DIM*jnrD;
189             j_coord_offsetE  = DIM*jnrE;
190             j_coord_offsetF  = DIM*jnrF;
191             j_coord_offsetG  = DIM*jnrG;
192             j_coord_offsetH  = DIM*jnrH;
193
194             /* load j atom coordinates */
195             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
196                                                  x+j_coord_offsetC,x+j_coord_offsetD,
197                                                  x+j_coord_offsetE,x+j_coord_offsetF,
198                                                  x+j_coord_offsetG,x+j_coord_offsetH,
199                                                  &jx0,&jy0,&jz0);
200
201             /* Calculate displacement vector */
202             dx00             = _mm256_sub_ps(ix0,jx0);
203             dy00             = _mm256_sub_ps(iy0,jy0);
204             dz00             = _mm256_sub_ps(iz0,jz0);
205
206             /* Calculate squared distance and things based on it */
207             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
208
209             rinv00           = avx256_invsqrt_f(rsq00);
210
211             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
212
213             /* Load parameters for j particles */
214             vdwjidx0A        = 2*vdwtype[jnrA+0];
215             vdwjidx0B        = 2*vdwtype[jnrB+0];
216             vdwjidx0C        = 2*vdwtype[jnrC+0];
217             vdwjidx0D        = 2*vdwtype[jnrD+0];
218             vdwjidx0E        = 2*vdwtype[jnrE+0];
219             vdwjidx0F        = 2*vdwtype[jnrF+0];
220             vdwjidx0G        = 2*vdwtype[jnrG+0];
221             vdwjidx0H        = 2*vdwtype[jnrH+0];
222
223             /**************************
224              * CALCULATE INTERACTIONS *
225              **************************/
226
227             r00              = _mm256_mul_ps(rsq00,rinv00);
228
229             /* Compute parameters for interactions between i and j atoms */
230             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
231                                             vdwioffsetptr0+vdwjidx0B,
232                                             vdwioffsetptr0+vdwjidx0C,
233                                             vdwioffsetptr0+vdwjidx0D,
234                                             vdwioffsetptr0+vdwjidx0E,
235                                             vdwioffsetptr0+vdwjidx0F,
236                                             vdwioffsetptr0+vdwjidx0G,
237                                             vdwioffsetptr0+vdwjidx0H,
238                                             &c6_00,&c12_00);
239
240             c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
241                                                                   vdwgridioffsetptr0+vdwjidx0B,
242                                                                   vdwgridioffsetptr0+vdwjidx0C,
243                                                                   vdwgridioffsetptr0+vdwjidx0D,
244                                                                   vdwgridioffsetptr0+vdwjidx0E,
245                                                                   vdwgridioffsetptr0+vdwjidx0F,
246                                                                   vdwgridioffsetptr0+vdwjidx0G,
247                                                                   vdwgridioffsetptr0+vdwjidx0H);
248
249             /* Analytical LJ-PME */
250             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
251             ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
252             ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
253             exponent         = avx256_exp_f(ewcljrsq);
254             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
255             poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
256             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
257             vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
258             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
259             vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
260             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
261             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
262
263             /* Update potential sum for this i atom from the interaction with this j atom. */
264             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
265
266             fscal            = fvdw;
267
268             /* Calculate temporary vectorial force */
269             tx               = _mm256_mul_ps(fscal,dx00);
270             ty               = _mm256_mul_ps(fscal,dy00);
271             tz               = _mm256_mul_ps(fscal,dz00);
272
273             /* Update vectorial force */
274             fix0             = _mm256_add_ps(fix0,tx);
275             fiy0             = _mm256_add_ps(fiy0,ty);
276             fiz0             = _mm256_add_ps(fiz0,tz);
277
278             fjptrA             = f+j_coord_offsetA;
279             fjptrB             = f+j_coord_offsetB;
280             fjptrC             = f+j_coord_offsetC;
281             fjptrD             = f+j_coord_offsetD;
282             fjptrE             = f+j_coord_offsetE;
283             fjptrF             = f+j_coord_offsetF;
284             fjptrG             = f+j_coord_offsetG;
285             fjptrH             = f+j_coord_offsetH;
286             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
287
288             /* Inner loop uses 51 flops */
289         }
290
291         if(jidx<j_index_end)
292         {
293
294             /* Get j neighbor index, and coordinate index */
295             jnrlistA         = jjnr[jidx];
296             jnrlistB         = jjnr[jidx+1];
297             jnrlistC         = jjnr[jidx+2];
298             jnrlistD         = jjnr[jidx+3];
299             jnrlistE         = jjnr[jidx+4];
300             jnrlistF         = jjnr[jidx+5];
301             jnrlistG         = jjnr[jidx+6];
302             jnrlistH         = jjnr[jidx+7];
303             /* Sign of each element will be negative for non-real atoms.
304              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
305              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
306              */
307             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
308                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
309                                             
310             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
311             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
312             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
313             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
314             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
315             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
316             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
317             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
318             j_coord_offsetA  = DIM*jnrA;
319             j_coord_offsetB  = DIM*jnrB;
320             j_coord_offsetC  = DIM*jnrC;
321             j_coord_offsetD  = DIM*jnrD;
322             j_coord_offsetE  = DIM*jnrE;
323             j_coord_offsetF  = DIM*jnrF;
324             j_coord_offsetG  = DIM*jnrG;
325             j_coord_offsetH  = DIM*jnrH;
326
327             /* load j atom coordinates */
328             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
329                                                  x+j_coord_offsetC,x+j_coord_offsetD,
330                                                  x+j_coord_offsetE,x+j_coord_offsetF,
331                                                  x+j_coord_offsetG,x+j_coord_offsetH,
332                                                  &jx0,&jy0,&jz0);
333
334             /* Calculate displacement vector */
335             dx00             = _mm256_sub_ps(ix0,jx0);
336             dy00             = _mm256_sub_ps(iy0,jy0);
337             dz00             = _mm256_sub_ps(iz0,jz0);
338
339             /* Calculate squared distance and things based on it */
340             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
341
342             rinv00           = avx256_invsqrt_f(rsq00);
343
344             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
345
346             /* Load parameters for j particles */
347             vdwjidx0A        = 2*vdwtype[jnrA+0];
348             vdwjidx0B        = 2*vdwtype[jnrB+0];
349             vdwjidx0C        = 2*vdwtype[jnrC+0];
350             vdwjidx0D        = 2*vdwtype[jnrD+0];
351             vdwjidx0E        = 2*vdwtype[jnrE+0];
352             vdwjidx0F        = 2*vdwtype[jnrF+0];
353             vdwjidx0G        = 2*vdwtype[jnrG+0];
354             vdwjidx0H        = 2*vdwtype[jnrH+0];
355
356             /**************************
357              * CALCULATE INTERACTIONS *
358              **************************/
359
360             r00              = _mm256_mul_ps(rsq00,rinv00);
361             r00              = _mm256_andnot_ps(dummy_mask,r00);
362
363             /* Compute parameters for interactions between i and j atoms */
364             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
365                                             vdwioffsetptr0+vdwjidx0B,
366                                             vdwioffsetptr0+vdwjidx0C,
367                                             vdwioffsetptr0+vdwjidx0D,
368                                             vdwioffsetptr0+vdwjidx0E,
369                                             vdwioffsetptr0+vdwjidx0F,
370                                             vdwioffsetptr0+vdwjidx0G,
371                                             vdwioffsetptr0+vdwjidx0H,
372                                             &c6_00,&c12_00);
373
374             c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
375                                                                   vdwgridioffsetptr0+vdwjidx0B,
376                                                                   vdwgridioffsetptr0+vdwjidx0C,
377                                                                   vdwgridioffsetptr0+vdwjidx0D,
378                                                                   vdwgridioffsetptr0+vdwjidx0E,
379                                                                   vdwgridioffsetptr0+vdwjidx0F,
380                                                                   vdwgridioffsetptr0+vdwjidx0G,
381                                                                   vdwgridioffsetptr0+vdwjidx0H);
382
383             /* Analytical LJ-PME */
384             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
385             ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
386             ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
387             exponent         = avx256_exp_f(ewcljrsq);
388             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
389             poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
390             /* vvdw6 = [C6 - C6grid * (1-poly)]/r6 */
391             vvdw6            = _mm256_mul_ps(_mm256_sub_ps(c6_00,_mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly))),rinvsix);
392             vvdw12           = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
393             vvdw             = _mm256_sub_ps(_mm256_mul_ps(vvdw12,one_twelfth),_mm256_mul_ps(vvdw6,one_sixth));
394             /* fvdw = vvdw12/r - (vvdw6/r + (C6grid * exponent * beta^6)/r) */
395             fvdw             = _mm256_mul_ps(_mm256_sub_ps(vvdw12,_mm256_sub_ps(vvdw6,_mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6)))),rinvsq00);
396
397             /* Update potential sum for this i atom from the interaction with this j atom. */
398             vvdw             = _mm256_andnot_ps(dummy_mask,vvdw);
399             vvdwsum          = _mm256_add_ps(vvdwsum,vvdw);
400
401             fscal            = fvdw;
402
403             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
404
405             /* Calculate temporary vectorial force */
406             tx               = _mm256_mul_ps(fscal,dx00);
407             ty               = _mm256_mul_ps(fscal,dy00);
408             tz               = _mm256_mul_ps(fscal,dz00);
409
410             /* Update vectorial force */
411             fix0             = _mm256_add_ps(fix0,tx);
412             fiy0             = _mm256_add_ps(fiy0,ty);
413             fiz0             = _mm256_add_ps(fiz0,tz);
414
415             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
416             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
417             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
418             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
419             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
420             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
421             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
422             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
423             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
424
425             /* Inner loop uses 52 flops */
426         }
427
428         /* End of innermost loop */
429
430         gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
431                                                  f+i_coord_offset,fshift+i_shift_offset);
432
433         ggid                        = gid[iidx];
434         /* Update potential energies */
435         gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
436
437         /* Increment number of inner iterations */
438         inneriter                  += j_index_end - j_index_start;
439
440         /* Outer loop uses 7 flops */
441     }
442
443     /* Increment number of outer iterations */
444     outeriter        += nri;
445
446     /* Update outer/inner flops */
447
448     inc_nrnb(nrnb,eNR_NBKERNEL_VDW_VF,outeriter*7 + inneriter*52);
449 }
450 /*
451  * Gromacs nonbonded kernel:   nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_single
452  * Electrostatics interaction: None
453  * VdW interaction:            LJEwald
454  * Geometry:                   Particle-Particle
455  * Calculate force/pot:        Force
456  */
457 void
458 nb_kernel_ElecNone_VdwLJEw_GeomP1P1_F_avx_256_single
459                     (t_nblist                    * gmx_restrict       nlist,
460                      rvec                        * gmx_restrict          xx,
461                      rvec                        * gmx_restrict          ff,
462                      struct t_forcerec           * gmx_restrict          fr,
463                      t_mdatoms                   * gmx_restrict     mdatoms,
464                      nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
465                      t_nrnb                      * gmx_restrict        nrnb)
466 {
467     /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or 
468      * just 0 for non-waters.
469      * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
470      * jnr indices corresponding to data put in the four positions in the SIMD register.
471      */
472     int              i_shift_offset,i_coord_offset,outeriter,inneriter;
473     int              j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
474     int              jnrA,jnrB,jnrC,jnrD;
475     int              jnrE,jnrF,jnrG,jnrH;
476     int              jnrlistA,jnrlistB,jnrlistC,jnrlistD;
477     int              jnrlistE,jnrlistF,jnrlistG,jnrlistH;
478     int              j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
479     int              j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
480     int              *iinr,*jindex,*jjnr,*shiftidx,*gid;
481     real             rcutoff_scalar;
482     real             *shiftvec,*fshift,*x,*f;
483     real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
484     real             scratch[4*DIM];
485     __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
486     real *           vdwioffsetptr0;
487     real *           vdwgridioffsetptr0;
488     __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
489     int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
490     __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
491     __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
492     int              nvdwtype;
493     __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
494     int              *vdwtype;
495     real             *vdwparam;
496     __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
497     __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
498     __m256           c6grid_00;
499     real             *vdwgridparam;
500     __m256           ewclj,ewclj2,ewclj6,ewcljrsq,poly,exponent,f6A,f6B,sh_lj_ewald;
501     __m256           one_half  = _mm256_set1_ps(0.5);
502     __m256           minus_one = _mm256_set1_ps(-1.0);
503     __m256           dummy_mask,cutoff_mask;
504     __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
505     __m256           one     = _mm256_set1_ps(1.0);
506     __m256           two     = _mm256_set1_ps(2.0);
507     x                = xx[0];
508     f                = ff[0];
509
510     nri              = nlist->nri;
511     iinr             = nlist->iinr;
512     jindex           = nlist->jindex;
513     jjnr             = nlist->jjnr;
514     shiftidx         = nlist->shift;
515     gid              = nlist->gid;
516     shiftvec         = fr->shift_vec[0];
517     fshift           = fr->fshift[0];
518     nvdwtype         = fr->ntype;
519     vdwparam         = fr->nbfp;
520     vdwtype          = mdatoms->typeA;
521     vdwgridparam     = fr->ljpme_c6grid;
522     sh_lj_ewald      = _mm256_set1_ps(fr->ic->sh_lj_ewald);
523     ewclj            = _mm256_set1_ps(fr->ic->ewaldcoeff_lj);
524     ewclj2           = _mm256_mul_ps(minus_one,_mm256_mul_ps(ewclj,ewclj));
525
526     /* Avoid stupid compiler warnings */
527     jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
528     j_coord_offsetA = 0;
529     j_coord_offsetB = 0;
530     j_coord_offsetC = 0;
531     j_coord_offsetD = 0;
532     j_coord_offsetE = 0;
533     j_coord_offsetF = 0;
534     j_coord_offsetG = 0;
535     j_coord_offsetH = 0;
536
537     outeriter        = 0;
538     inneriter        = 0;
539
540     for(iidx=0;iidx<4*DIM;iidx++)
541     {
542         scratch[iidx] = 0.0;
543     }
544
545     /* Start outer loop over neighborlists */
546     for(iidx=0; iidx<nri; iidx++)
547     {
548         /* Load shift vector for this list */
549         i_shift_offset   = DIM*shiftidx[iidx];
550
551         /* Load limits for loop over neighbors */
552         j_index_start    = jindex[iidx];
553         j_index_end      = jindex[iidx+1];
554
555         /* Get outer coordinate index */
556         inr              = iinr[iidx];
557         i_coord_offset   = DIM*inr;
558
559         /* Load i particle coords and add shift vector */
560         gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
561
562         fix0             = _mm256_setzero_ps();
563         fiy0             = _mm256_setzero_ps();
564         fiz0             = _mm256_setzero_ps();
565
566         /* Load parameters for i particles */
567         vdwioffsetptr0   = vdwparam+2*nvdwtype*vdwtype[inr+0];
568         vdwgridioffsetptr0 = vdwgridparam+2*nvdwtype*vdwtype[inr+0];
569
570         /* Start inner kernel loop */
571         for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
572         {
573
574             /* Get j neighbor index, and coordinate index */
575             jnrA             = jjnr[jidx];
576             jnrB             = jjnr[jidx+1];
577             jnrC             = jjnr[jidx+2];
578             jnrD             = jjnr[jidx+3];
579             jnrE             = jjnr[jidx+4];
580             jnrF             = jjnr[jidx+5];
581             jnrG             = jjnr[jidx+6];
582             jnrH             = jjnr[jidx+7];
583             j_coord_offsetA  = DIM*jnrA;
584             j_coord_offsetB  = DIM*jnrB;
585             j_coord_offsetC  = DIM*jnrC;
586             j_coord_offsetD  = DIM*jnrD;
587             j_coord_offsetE  = DIM*jnrE;
588             j_coord_offsetF  = DIM*jnrF;
589             j_coord_offsetG  = DIM*jnrG;
590             j_coord_offsetH  = DIM*jnrH;
591
592             /* load j atom coordinates */
593             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
594                                                  x+j_coord_offsetC,x+j_coord_offsetD,
595                                                  x+j_coord_offsetE,x+j_coord_offsetF,
596                                                  x+j_coord_offsetG,x+j_coord_offsetH,
597                                                  &jx0,&jy0,&jz0);
598
599             /* Calculate displacement vector */
600             dx00             = _mm256_sub_ps(ix0,jx0);
601             dy00             = _mm256_sub_ps(iy0,jy0);
602             dz00             = _mm256_sub_ps(iz0,jz0);
603
604             /* Calculate squared distance and things based on it */
605             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
606
607             rinv00           = avx256_invsqrt_f(rsq00);
608
609             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
610
611             /* Load parameters for j particles */
612             vdwjidx0A        = 2*vdwtype[jnrA+0];
613             vdwjidx0B        = 2*vdwtype[jnrB+0];
614             vdwjidx0C        = 2*vdwtype[jnrC+0];
615             vdwjidx0D        = 2*vdwtype[jnrD+0];
616             vdwjidx0E        = 2*vdwtype[jnrE+0];
617             vdwjidx0F        = 2*vdwtype[jnrF+0];
618             vdwjidx0G        = 2*vdwtype[jnrG+0];
619             vdwjidx0H        = 2*vdwtype[jnrH+0];
620
621             /**************************
622              * CALCULATE INTERACTIONS *
623              **************************/
624
625             r00              = _mm256_mul_ps(rsq00,rinv00);
626
627             /* Compute parameters for interactions between i and j atoms */
628             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
629                                             vdwioffsetptr0+vdwjidx0B,
630                                             vdwioffsetptr0+vdwjidx0C,
631                                             vdwioffsetptr0+vdwjidx0D,
632                                             vdwioffsetptr0+vdwjidx0E,
633                                             vdwioffsetptr0+vdwjidx0F,
634                                             vdwioffsetptr0+vdwjidx0G,
635                                             vdwioffsetptr0+vdwjidx0H,
636                                             &c6_00,&c12_00);
637
638             c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
639                                                                   vdwgridioffsetptr0+vdwjidx0B,
640                                                                   vdwgridioffsetptr0+vdwjidx0C,
641                                                                   vdwgridioffsetptr0+vdwjidx0D,
642                                                                   vdwgridioffsetptr0+vdwjidx0E,
643                                                                   vdwgridioffsetptr0+vdwjidx0F,
644                                                                   vdwgridioffsetptr0+vdwjidx0G,
645                                                                   vdwgridioffsetptr0+vdwjidx0H);
646
647             /* Analytical LJ-PME */
648             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
649             ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
650             ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
651             exponent         = avx256_exp_f(ewcljrsq);
652             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
653             poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
654             /* f6A = 6 * C6grid * (1 - poly) */
655             f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
656             /* f6B = C6grid * exponent * beta^6 */
657             f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
658             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
659             fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
660
661             fscal            = fvdw;
662
663             /* Calculate temporary vectorial force */
664             tx               = _mm256_mul_ps(fscal,dx00);
665             ty               = _mm256_mul_ps(fscal,dy00);
666             tz               = _mm256_mul_ps(fscal,dz00);
667
668             /* Update vectorial force */
669             fix0             = _mm256_add_ps(fix0,tx);
670             fiy0             = _mm256_add_ps(fiy0,ty);
671             fiz0             = _mm256_add_ps(fiz0,tz);
672
673             fjptrA             = f+j_coord_offsetA;
674             fjptrB             = f+j_coord_offsetB;
675             fjptrC             = f+j_coord_offsetC;
676             fjptrD             = f+j_coord_offsetD;
677             fjptrE             = f+j_coord_offsetE;
678             fjptrF             = f+j_coord_offsetF;
679             fjptrG             = f+j_coord_offsetG;
680             fjptrH             = f+j_coord_offsetH;
681             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
682
683             /* Inner loop uses 46 flops */
684         }
685
686         if(jidx<j_index_end)
687         {
688
689             /* Get j neighbor index, and coordinate index */
690             jnrlistA         = jjnr[jidx];
691             jnrlistB         = jjnr[jidx+1];
692             jnrlistC         = jjnr[jidx+2];
693             jnrlistD         = jjnr[jidx+3];
694             jnrlistE         = jjnr[jidx+4];
695             jnrlistF         = jjnr[jidx+5];
696             jnrlistG         = jjnr[jidx+6];
697             jnrlistH         = jjnr[jidx+7];
698             /* Sign of each element will be negative for non-real atoms.
699              * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
700              * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
701              */
702             dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
703                                             gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
704                                             
705             jnrA       = (jnrlistA>=0) ? jnrlistA : 0;
706             jnrB       = (jnrlistB>=0) ? jnrlistB : 0;
707             jnrC       = (jnrlistC>=0) ? jnrlistC : 0;
708             jnrD       = (jnrlistD>=0) ? jnrlistD : 0;
709             jnrE       = (jnrlistE>=0) ? jnrlistE : 0;
710             jnrF       = (jnrlistF>=0) ? jnrlistF : 0;
711             jnrG       = (jnrlistG>=0) ? jnrlistG : 0;
712             jnrH       = (jnrlistH>=0) ? jnrlistH : 0;
713             j_coord_offsetA  = DIM*jnrA;
714             j_coord_offsetB  = DIM*jnrB;
715             j_coord_offsetC  = DIM*jnrC;
716             j_coord_offsetD  = DIM*jnrD;
717             j_coord_offsetE  = DIM*jnrE;
718             j_coord_offsetF  = DIM*jnrF;
719             j_coord_offsetG  = DIM*jnrG;
720             j_coord_offsetH  = DIM*jnrH;
721
722             /* load j atom coordinates */
723             gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
724                                                  x+j_coord_offsetC,x+j_coord_offsetD,
725                                                  x+j_coord_offsetE,x+j_coord_offsetF,
726                                                  x+j_coord_offsetG,x+j_coord_offsetH,
727                                                  &jx0,&jy0,&jz0);
728
729             /* Calculate displacement vector */
730             dx00             = _mm256_sub_ps(ix0,jx0);
731             dy00             = _mm256_sub_ps(iy0,jy0);
732             dz00             = _mm256_sub_ps(iz0,jz0);
733
734             /* Calculate squared distance and things based on it */
735             rsq00            = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
736
737             rinv00           = avx256_invsqrt_f(rsq00);
738
739             rinvsq00         = _mm256_mul_ps(rinv00,rinv00);
740
741             /* Load parameters for j particles */
742             vdwjidx0A        = 2*vdwtype[jnrA+0];
743             vdwjidx0B        = 2*vdwtype[jnrB+0];
744             vdwjidx0C        = 2*vdwtype[jnrC+0];
745             vdwjidx0D        = 2*vdwtype[jnrD+0];
746             vdwjidx0E        = 2*vdwtype[jnrE+0];
747             vdwjidx0F        = 2*vdwtype[jnrF+0];
748             vdwjidx0G        = 2*vdwtype[jnrG+0];
749             vdwjidx0H        = 2*vdwtype[jnrH+0];
750
751             /**************************
752              * CALCULATE INTERACTIONS *
753              **************************/
754
755             r00              = _mm256_mul_ps(rsq00,rinv00);
756             r00              = _mm256_andnot_ps(dummy_mask,r00);
757
758             /* Compute parameters for interactions between i and j atoms */
759             gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
760                                             vdwioffsetptr0+vdwjidx0B,
761                                             vdwioffsetptr0+vdwjidx0C,
762                                             vdwioffsetptr0+vdwjidx0D,
763                                             vdwioffsetptr0+vdwjidx0E,
764                                             vdwioffsetptr0+vdwjidx0F,
765                                             vdwioffsetptr0+vdwjidx0G,
766                                             vdwioffsetptr0+vdwjidx0H,
767                                             &c6_00,&c12_00);
768
769             c6grid_00       = gmx_mm256_load_8real_swizzle_ps(vdwgridioffsetptr0+vdwjidx0A,
770                                                                   vdwgridioffsetptr0+vdwjidx0B,
771                                                                   vdwgridioffsetptr0+vdwjidx0C,
772                                                                   vdwgridioffsetptr0+vdwjidx0D,
773                                                                   vdwgridioffsetptr0+vdwjidx0E,
774                                                                   vdwgridioffsetptr0+vdwjidx0F,
775                                                                   vdwgridioffsetptr0+vdwjidx0G,
776                                                                   vdwgridioffsetptr0+vdwjidx0H);
777
778             /* Analytical LJ-PME */
779             rinvsix          = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
780             ewcljrsq         = _mm256_mul_ps(ewclj2,rsq00);
781             ewclj6           = _mm256_mul_ps(ewclj2,_mm256_mul_ps(ewclj2,ewclj2));
782             exponent         = avx256_exp_f(ewcljrsq);
783             /* poly = exp(-(beta*r)^2) * (1 + (beta*r)^2 + (beta*r)^4 /2) */
784             poly             = _mm256_mul_ps(exponent,_mm256_add_ps(_mm256_sub_ps(one,ewcljrsq),_mm256_mul_ps(_mm256_mul_ps(ewcljrsq,ewcljrsq),one_half)));
785             /* f6A = 6 * C6grid * (1 - poly) */
786             f6A              = _mm256_mul_ps(c6grid_00,_mm256_sub_ps(one,poly));
787             /* f6B = C6grid * exponent * beta^6 */
788             f6B              = _mm256_mul_ps(_mm256_mul_ps(c6grid_00,one_sixth),_mm256_mul_ps(exponent,ewclj6));
789             /* fvdw = 12*C12/r13 - ((6*C6 - f6A)/r6 + f6B)/r */
790             fvdw              = _mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),_mm256_sub_ps(c6_00,f6A)),rinvsix),f6B),rinvsq00);
791
792             fscal            = fvdw;
793
794             fscal            = _mm256_andnot_ps(dummy_mask,fscal);
795
796             /* Calculate temporary vectorial force */
797             tx               = _mm256_mul_ps(fscal,dx00);
798             ty               = _mm256_mul_ps(fscal,dy00);
799             tz               = _mm256_mul_ps(fscal,dz00);
800
801             /* Update vectorial force */
802             fix0             = _mm256_add_ps(fix0,tx);
803             fiy0             = _mm256_add_ps(fiy0,ty);
804             fiz0             = _mm256_add_ps(fiz0,tz);
805
806             fjptrA             = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
807             fjptrB             = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
808             fjptrC             = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
809             fjptrD             = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
810             fjptrE             = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
811             fjptrF             = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
812             fjptrG             = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
813             fjptrH             = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
814             gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
815
816             /* Inner loop uses 47 flops */
817         }
818
819         /* End of innermost loop */
820
821         gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
822                                                  f+i_coord_offset,fshift+i_shift_offset);
823
824         /* Increment number of inner iterations */
825         inneriter                  += j_index_end - j_index_start;
826
827         /* Outer loop uses 6 flops */
828     }
829
830     /* Increment number of outer iterations */
831     outeriter        += nri;
832
833     /* Update outer/inner flops */
834
835     inc_nrnb(nrnb,eNR_NBKERNEL_VDW_F,outeriter*6 + inneriter*47);
836 }