2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_single
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Particle
54 * Calculate force/pot: PotentialAndForce
/*
 * Potential-and-force kernel for a four-site water i molecule against single
 * j particles, handling eight j entries (A..H) per step in 256-bit registers.
 *
 * What the visible code does:
 *  - i atom 0 interacts with j through Lennard-Jones only (c6_00/c12_00 and
 *    1/r^2); i atoms 1-3 interact through cubic-spline-table electrostatics
 *    using charges pre-multiplied by facel.
 *  - For every neighbor list the i-force sums are handed to
 *    gmx_mm256_update_iforce_4atom_swizzle_ps together with the f and fshift
 *    destinations, and velecsum/vvdwsum are handed to gmx_mm256_update_1pot_ps
 *    for energy group index ggid.
 *  - The summed j force of each batch is passed to
 *    gmx_mm256_decrement_1rvec_8ptr_swizzle_ps.
 *
 * Inputs read here: nlist (jindex, shift, iinr), fr (shift_vec, fshift,
 * ic->epsfac, ntype), mdatoms (chargeA, typeA), kernel_data (table_elec,
 * energygrp_elec, energygrp_vdw); nrnb is passed to inc_nrnb.
 *
 * NOTE(review): this listing looks incomplete - no assignment to fscal, ggid,
 * scratch, x, f or jjnr is visible, and several declarations and braces are
 * absent. Confirm against the generated original before relying on this text.
 */
57 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69 * jnr indices corresponding to data put in the eight positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrE,jnrF,jnrG,jnrH;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 real * vdwioffsetptr0;
86 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 real * vdwioffsetptr1;
88 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 real * vdwioffsetptr2;
90 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 real * vdwioffsetptr3;
92 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
93 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
94 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
95 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
96 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
97 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
98 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
99 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
102 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
105 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
106 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
108 __m128i vfitab_lo,vfitab_hi;
109 __m128i ifour = _mm_set1_epi32(4);
110 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
112 __m256 dummy_mask,cutoff_mask;
113 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
114 __m256 one = _mm256_set1_ps(1.0);
115 __m256 two = _mm256_set1_ps(2.0);
121 jindex = nlist->jindex;
123 shiftidx = nlist->shift;
125 shiftvec = fr->shift_vec[0];
126 fshift = fr->fshift[0];
127 facel = _mm256_set1_ps(fr->ic->epsfac);
128 charge = mdatoms->chargeA;
129 nvdwtype = fr->ntype;
131 vdwtype = mdatoms->typeA;
133 vftab = kernel_data->table_elec->data;
134 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
136 /* Setup water-specific parameters */
137 inr = nlist->iinr[0];
/* Charges of i atoms 1-3 are taken from the first i entry and pre-multiplied by facel;
 * no charge is set up for i atom 0 in the visible code.
 */
138 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
139 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
140 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
/* LJ parameter base for the type of i atom 0; the 2*vdwtype[j] offsets used below index into it. */
141 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
143 /* Avoid stupid compiler warnings */
144 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
157 for(iidx=0;iidx<4*DIM;iidx++)
162 /* Start outer loop over neighborlists */
163 for(iidx=0; iidx<nri; iidx++)
165 /* Load shift vector for this list */
166 i_shift_offset = DIM*shiftidx[iidx];
168 /* Load limits for loop over neighbors */
169 j_index_start = jindex[iidx];
170 j_index_end = jindex[iidx+1];
172 /* Get outer coordinate index */
174 i_coord_offset = DIM*inr;
176 /* Load i particle coords and add shift vector */
177 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
178 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
180 fix0 = _mm256_setzero_ps();
181 fiy0 = _mm256_setzero_ps();
182 fiz0 = _mm256_setzero_ps();
183 fix1 = _mm256_setzero_ps();
184 fiy1 = _mm256_setzero_ps();
185 fiz1 = _mm256_setzero_ps();
186 fix2 = _mm256_setzero_ps();
187 fiy2 = _mm256_setzero_ps();
188 fiz2 = _mm256_setzero_ps();
189 fix3 = _mm256_setzero_ps();
190 fiy3 = _mm256_setzero_ps();
191 fiz3 = _mm256_setzero_ps();
193 /* Reset potential sums */
194 velecsum = _mm256_setzero_ps();
195 vvdwsum = _mm256_setzero_ps();
197 /* Start inner kernel loop */
/* Unmasked loop: continues only while the last entry of the current batch of
 * eight, jjnr[jidx+7], is non-negative.
 */
198 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
201 /* Get j neighbor index, and coordinate index */
210 j_coord_offsetA = DIM*jnrA;
211 j_coord_offsetB = DIM*jnrB;
212 j_coord_offsetC = DIM*jnrC;
213 j_coord_offsetD = DIM*jnrD;
214 j_coord_offsetE = DIM*jnrE;
215 j_coord_offsetF = DIM*jnrF;
216 j_coord_offsetG = DIM*jnrG;
217 j_coord_offsetH = DIM*jnrH;
219 /* load j atom coordinates */
220 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
221 x+j_coord_offsetC,x+j_coord_offsetD,
222 x+j_coord_offsetE,x+j_coord_offsetF,
223 x+j_coord_offsetG,x+j_coord_offsetH,
226 /* Calculate displacement vector */
227 dx00 = _mm256_sub_ps(ix0,jx0);
228 dy00 = _mm256_sub_ps(iy0,jy0);
229 dz00 = _mm256_sub_ps(iz0,jz0);
230 dx10 = _mm256_sub_ps(ix1,jx0);
231 dy10 = _mm256_sub_ps(iy1,jy0);
232 dz10 = _mm256_sub_ps(iz1,jz0);
233 dx20 = _mm256_sub_ps(ix2,jx0);
234 dy20 = _mm256_sub_ps(iy2,jy0);
235 dz20 = _mm256_sub_ps(iz2,jz0);
236 dx30 = _mm256_sub_ps(ix3,jx0);
237 dy30 = _mm256_sub_ps(iy3,jy0);
238 dz30 = _mm256_sub_ps(iz3,jz0);
240 /* Calculate squared distance and things based on it */
241 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
242 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
243 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
244 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
/* Pairs 10/20/30 get an inverse square root (used below to form r for the table lookup);
 * pair 00 gets an inverse of rsq, which is all the LJ expressions below use.
 */
246 rinv10 = avx256_invsqrt_f(rsq10);
247 rinv20 = avx256_invsqrt_f(rsq20);
248 rinv30 = avx256_invsqrt_f(rsq30);
250 rinvsq00 = avx256_inv_f(rsq00);
252 /* Load parameters for j particles */
253 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
254 charge+jnrC+0,charge+jnrD+0,
255 charge+jnrE+0,charge+jnrF+0,
256 charge+jnrG+0,charge+jnrH+0);
257 vdwjidx0A = 2*vdwtype[jnrA+0];
258 vdwjidx0B = 2*vdwtype[jnrB+0];
259 vdwjidx0C = 2*vdwtype[jnrC+0];
260 vdwjidx0D = 2*vdwtype[jnrD+0];
261 vdwjidx0E = 2*vdwtype[jnrE+0];
262 vdwjidx0F = 2*vdwtype[jnrF+0];
263 vdwjidx0G = 2*vdwtype[jnrG+0];
264 vdwjidx0H = 2*vdwtype[jnrH+0];
266 fjx0 = _mm256_setzero_ps();
267 fjy0 = _mm256_setzero_ps();
268 fjz0 = _mm256_setzero_ps();
270 /**************************
271 * CALCULATE INTERACTIONS *
272 **************************/
274 /* Compute parameters for interactions between i and j atoms */
275 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
276 vdwioffsetptr0+vdwjidx0B,
277 vdwioffsetptr0+vdwjidx0C,
278 vdwioffsetptr0+vdwjidx0D,
279 vdwioffsetptr0+vdwjidx0E,
280 vdwioffsetptr0+vdwjidx0F,
281 vdwioffsetptr0+vdwjidx0G,
282 vdwioffsetptr0+vdwjidx0H,
285 /* LENNARD-JONES DISPERSION/REPULSION */
287 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
288 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
289 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
290 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
291 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
293 /* Update potential sum for this i atom from the interaction with this j atom. */
294 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
298 /* Calculate temporary vectorial force */
299 tx = _mm256_mul_ps(fscal,dx00);
300 ty = _mm256_mul_ps(fscal,dy00);
301 tz = _mm256_mul_ps(fscal,dz00);
303 /* Update vectorial force */
304 fix0 = _mm256_add_ps(fix0,tx);
305 fiy0 = _mm256_add_ps(fiy0,ty);
306 fiz0 = _mm256_add_ps(fiz0,tz);
308 fjx0 = _mm256_add_ps(fjx0,tx);
309 fjy0 = _mm256_add_ps(fjy0,ty);
310 fjz0 = _mm256_add_ps(fjz0,tz);
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 r10 = _mm256_mul_ps(rsq10,rinv10);
318 /* Compute parameters for interactions between i and j atoms */
319 qq10 = _mm256_mul_ps(iq1,jq0);
321 /* Calculate table index by multiplying r with table scale and truncate to integer */
322 rt = _mm256_mul_ps(r10,vftabscale);
323 vfitab = _mm256_cvttps_epi32(rt);
324 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
325 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
326 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
327 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
/* Shift left by 2 multiplies each index by 4: every lookup below loads four consecutive
 * reals from vftab, which end up in Y, F, G and H after the half-transpose.
 */
328 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
329 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
331 /* CUBIC SPLINE TABLE ELECTROSTATICS */
332 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
333 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
334 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
335 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
336 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
337 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
338 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
339 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
340 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
341 Heps = _mm256_mul_ps(vfeps,H);
342 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
343 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
344 velec = _mm256_mul_ps(qq10,VV);
345 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
/* felec = -(qq*FF*vftabscale*rinv): the XOR with signbit flips the sign of the product. */
346 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
348 /* Update potential sum for this i atom from the interaction with this j atom. */
349 velecsum = _mm256_add_ps(velecsum,velec);
353 /* Calculate temporary vectorial force */
354 tx = _mm256_mul_ps(fscal,dx10);
355 ty = _mm256_mul_ps(fscal,dy10);
356 tz = _mm256_mul_ps(fscal,dz10);
358 /* Update vectorial force */
359 fix1 = _mm256_add_ps(fix1,tx);
360 fiy1 = _mm256_add_ps(fiy1,ty);
361 fiz1 = _mm256_add_ps(fiz1,tz);
363 fjx0 = _mm256_add_ps(fjx0,tx);
364 fjy0 = _mm256_add_ps(fjy0,ty);
365 fjz0 = _mm256_add_ps(fjz0,tz);
367 /**************************
368 * CALCULATE INTERACTIONS *
369 **************************/
371 r20 = _mm256_mul_ps(rsq20,rinv20);
373 /* Compute parameters for interactions between i and j atoms */
374 qq20 = _mm256_mul_ps(iq2,jq0);
376 /* Calculate table index by multiplying r with table scale and truncate to integer */
377 rt = _mm256_mul_ps(r20,vftabscale);
378 vfitab = _mm256_cvttps_epi32(rt);
379 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
380 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
381 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
382 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
383 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
384 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
386 /* CUBIC SPLINE TABLE ELECTROSTATICS */
387 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
388 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
389 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
390 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
391 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
392 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
393 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
394 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
395 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
396 Heps = _mm256_mul_ps(vfeps,H);
397 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
398 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
399 velec = _mm256_mul_ps(qq20,VV);
400 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
401 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
403 /* Update potential sum for this i atom from the interaction with this j atom. */
404 velecsum = _mm256_add_ps(velecsum,velec);
408 /* Calculate temporary vectorial force */
409 tx = _mm256_mul_ps(fscal,dx20);
410 ty = _mm256_mul_ps(fscal,dy20);
411 tz = _mm256_mul_ps(fscal,dz20);
413 /* Update vectorial force */
414 fix2 = _mm256_add_ps(fix2,tx);
415 fiy2 = _mm256_add_ps(fiy2,ty);
416 fiz2 = _mm256_add_ps(fiz2,tz);
418 fjx0 = _mm256_add_ps(fjx0,tx);
419 fjy0 = _mm256_add_ps(fjy0,ty);
420 fjz0 = _mm256_add_ps(fjz0,tz);
422 /**************************
423 * CALCULATE INTERACTIONS *
424 **************************/
426 r30 = _mm256_mul_ps(rsq30,rinv30);
428 /* Compute parameters for interactions between i and j atoms */
429 qq30 = _mm256_mul_ps(iq3,jq0);
431 /* Calculate table index by multiplying r with table scale and truncate to integer */
432 rt = _mm256_mul_ps(r30,vftabscale);
433 vfitab = _mm256_cvttps_epi32(rt);
434 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
435 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
436 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
437 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
438 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
439 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
441 /* CUBIC SPLINE TABLE ELECTROSTATICS */
442 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
443 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
444 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
445 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
446 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
447 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
448 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
449 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
450 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
451 Heps = _mm256_mul_ps(vfeps,H);
452 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
453 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
454 velec = _mm256_mul_ps(qq30,VV);
455 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
456 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
458 /* Update potential sum for this i atom from the interaction with this j atom. */
459 velecsum = _mm256_add_ps(velecsum,velec);
463 /* Calculate temporary vectorial force */
464 tx = _mm256_mul_ps(fscal,dx30);
465 ty = _mm256_mul_ps(fscal,dy30);
466 tz = _mm256_mul_ps(fscal,dz30);
468 /* Update vectorial force */
469 fix3 = _mm256_add_ps(fix3,tx);
470 fiy3 = _mm256_add_ps(fiy3,ty);
471 fiz3 = _mm256_add_ps(fiz3,tz);
473 fjx0 = _mm256_add_ps(fjx0,tx);
474 fjy0 = _mm256_add_ps(fjy0,ty);
475 fjz0 = _mm256_add_ps(fjz0,tz);
477 fjptrA = f+j_coord_offsetA;
478 fjptrB = f+j_coord_offsetB;
479 fjptrC = f+j_coord_offsetC;
480 fjptrD = f+j_coord_offsetD;
481 fjptrE = f+j_coord_offsetE;
482 fjptrF = f+j_coord_offsetF;
483 fjptrG = f+j_coord_offsetG;
484 fjptrH = f+j_coord_offsetH;
486 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
488 /* Inner loop uses 164 flops */
/* Second copy of the inner body, this time masking out dummy entries (negative jjnr).
 * NOTE(review): the condition that guards this part is not visible in this listing.
 */
494 /* Get j neighbor index, and coordinate index */
495 jnrlistA = jjnr[jidx];
496 jnrlistB = jjnr[jidx+1];
497 jnrlistC = jjnr[jidx+2];
498 jnrlistD = jjnr[jidx+3];
499 jnrlistE = jjnr[jidx+4];
500 jnrlistF = jjnr[jidx+5];
501 jnrlistG = jjnr[jidx+6];
502 jnrlistH = jjnr[jidx+7];
503 /* Sign of each element will be negative for non-real atoms.
504 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
505 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
507 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
508 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
510 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
511 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
512 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
513 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
514 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
515 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
516 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
517 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
518 j_coord_offsetA = DIM*jnrA;
519 j_coord_offsetB = DIM*jnrB;
520 j_coord_offsetC = DIM*jnrC;
521 j_coord_offsetD = DIM*jnrD;
522 j_coord_offsetE = DIM*jnrE;
523 j_coord_offsetF = DIM*jnrF;
524 j_coord_offsetG = DIM*jnrG;
525 j_coord_offsetH = DIM*jnrH;
527 /* load j atom coordinates */
528 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
529 x+j_coord_offsetC,x+j_coord_offsetD,
530 x+j_coord_offsetE,x+j_coord_offsetF,
531 x+j_coord_offsetG,x+j_coord_offsetH,
534 /* Calculate displacement vector */
535 dx00 = _mm256_sub_ps(ix0,jx0);
536 dy00 = _mm256_sub_ps(iy0,jy0);
537 dz00 = _mm256_sub_ps(iz0,jz0);
538 dx10 = _mm256_sub_ps(ix1,jx0);
539 dy10 = _mm256_sub_ps(iy1,jy0);
540 dz10 = _mm256_sub_ps(iz1,jz0);
541 dx20 = _mm256_sub_ps(ix2,jx0);
542 dy20 = _mm256_sub_ps(iy2,jy0);
543 dz20 = _mm256_sub_ps(iz2,jz0);
544 dx30 = _mm256_sub_ps(ix3,jx0);
545 dy30 = _mm256_sub_ps(iy3,jy0);
546 dz30 = _mm256_sub_ps(iz3,jz0);
548 /* Calculate squared distance and things based on it */
549 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
550 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
551 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
552 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
554 rinv10 = avx256_invsqrt_f(rsq10);
555 rinv20 = avx256_invsqrt_f(rsq20);
556 rinv30 = avx256_invsqrt_f(rsq30);
558 rinvsq00 = avx256_inv_f(rsq00);
560 /* Load parameters for j particles */
561 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
562 charge+jnrC+0,charge+jnrD+0,
563 charge+jnrE+0,charge+jnrF+0,
564 charge+jnrG+0,charge+jnrH+0);
565 vdwjidx0A = 2*vdwtype[jnrA+0];
566 vdwjidx0B = 2*vdwtype[jnrB+0];
567 vdwjidx0C = 2*vdwtype[jnrC+0];
568 vdwjidx0D = 2*vdwtype[jnrD+0];
569 vdwjidx0E = 2*vdwtype[jnrE+0];
570 vdwjidx0F = 2*vdwtype[jnrF+0];
571 vdwjidx0G = 2*vdwtype[jnrG+0];
572 vdwjidx0H = 2*vdwtype[jnrH+0];
574 fjx0 = _mm256_setzero_ps();
575 fjy0 = _mm256_setzero_ps();
576 fjz0 = _mm256_setzero_ps();
578 /**************************
579 * CALCULATE INTERACTIONS *
580 **************************/
582 /* Compute parameters for interactions between i and j atoms */
583 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
584 vdwioffsetptr0+vdwjidx0B,
585 vdwioffsetptr0+vdwjidx0C,
586 vdwioffsetptr0+vdwjidx0D,
587 vdwioffsetptr0+vdwjidx0E,
588 vdwioffsetptr0+vdwjidx0F,
589 vdwioffsetptr0+vdwjidx0G,
590 vdwioffsetptr0+vdwjidx0H,
593 /* LENNARD-JONES DISPERSION/REPULSION */
595 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
596 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
597 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
598 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
599 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
601 /* Update potential sum for this i atom from the interaction with this j atom. */
602 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
603 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
607 fscal = _mm256_andnot_ps(dummy_mask,fscal);
609 /* Calculate temporary vectorial force */
610 tx = _mm256_mul_ps(fscal,dx00);
611 ty = _mm256_mul_ps(fscal,dy00);
612 tz = _mm256_mul_ps(fscal,dz00);
614 /* Update vectorial force */
615 fix0 = _mm256_add_ps(fix0,tx);
616 fiy0 = _mm256_add_ps(fiy0,ty);
617 fiz0 = _mm256_add_ps(fiz0,tz);
619 fjx0 = _mm256_add_ps(fjx0,tx);
620 fjy0 = _mm256_add_ps(fjy0,ty);
621 fjz0 = _mm256_add_ps(fjz0,tz);
623 /**************************
624 * CALCULATE INTERACTIONS *
625 **************************/
627 r10 = _mm256_mul_ps(rsq10,rinv10);
/* Clear r for dummy entries, so rt and hence the table index computed from it are zero there. */
628 r10 = _mm256_andnot_ps(dummy_mask,r10);
630 /* Compute parameters for interactions between i and j atoms */
631 qq10 = _mm256_mul_ps(iq1,jq0);
633 /* Calculate table index by multiplying r with table scale and truncate to integer */
634 rt = _mm256_mul_ps(r10,vftabscale);
635 vfitab = _mm256_cvttps_epi32(rt);
636 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
637 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
638 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
639 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
640 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
641 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
643 /* CUBIC SPLINE TABLE ELECTROSTATICS */
644 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
645 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
646 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
647 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
648 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
649 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
650 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
651 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
652 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
653 Heps = _mm256_mul_ps(vfeps,H);
654 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
655 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
656 velec = _mm256_mul_ps(qq10,VV);
657 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
658 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
660 /* Update potential sum for this i atom from the interaction with this j atom. */
661 velec = _mm256_andnot_ps(dummy_mask,velec);
662 velecsum = _mm256_add_ps(velecsum,velec);
666 fscal = _mm256_andnot_ps(dummy_mask,fscal);
668 /* Calculate temporary vectorial force */
669 tx = _mm256_mul_ps(fscal,dx10);
670 ty = _mm256_mul_ps(fscal,dy10);
671 tz = _mm256_mul_ps(fscal,dz10);
673 /* Update vectorial force */
674 fix1 = _mm256_add_ps(fix1,tx);
675 fiy1 = _mm256_add_ps(fiy1,ty);
676 fiz1 = _mm256_add_ps(fiz1,tz);
678 fjx0 = _mm256_add_ps(fjx0,tx);
679 fjy0 = _mm256_add_ps(fjy0,ty);
680 fjz0 = _mm256_add_ps(fjz0,tz);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 r20 = _mm256_mul_ps(rsq20,rinv20);
687 r20 = _mm256_andnot_ps(dummy_mask,r20);
689 /* Compute parameters for interactions between i and j atoms */
690 qq20 = _mm256_mul_ps(iq2,jq0);
692 /* Calculate table index by multiplying r with table scale and truncate to integer */
693 rt = _mm256_mul_ps(r20,vftabscale);
694 vfitab = _mm256_cvttps_epi32(rt);
695 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
696 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
697 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
698 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
699 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
700 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
702 /* CUBIC SPLINE TABLE ELECTROSTATICS */
703 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
704 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
705 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
706 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
707 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
708 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
709 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
710 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
711 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
712 Heps = _mm256_mul_ps(vfeps,H);
713 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
714 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
715 velec = _mm256_mul_ps(qq20,VV);
716 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
717 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
719 /* Update potential sum for this i atom from the interaction with this j atom. */
720 velec = _mm256_andnot_ps(dummy_mask,velec);
721 velecsum = _mm256_add_ps(velecsum,velec);
725 fscal = _mm256_andnot_ps(dummy_mask,fscal);
727 /* Calculate temporary vectorial force */
728 tx = _mm256_mul_ps(fscal,dx20);
729 ty = _mm256_mul_ps(fscal,dy20);
730 tz = _mm256_mul_ps(fscal,dz20);
732 /* Update vectorial force */
733 fix2 = _mm256_add_ps(fix2,tx);
734 fiy2 = _mm256_add_ps(fiy2,ty);
735 fiz2 = _mm256_add_ps(fiz2,tz);
737 fjx0 = _mm256_add_ps(fjx0,tx);
738 fjy0 = _mm256_add_ps(fjy0,ty);
739 fjz0 = _mm256_add_ps(fjz0,tz);
741 /**************************
742 * CALCULATE INTERACTIONS *
743 **************************/
745 r30 = _mm256_mul_ps(rsq30,rinv30);
746 r30 = _mm256_andnot_ps(dummy_mask,r30);
748 /* Compute parameters for interactions between i and j atoms */
749 qq30 = _mm256_mul_ps(iq3,jq0);
751 /* Calculate table index by multiplying r with table scale and truncate to integer */
752 rt = _mm256_mul_ps(r30,vftabscale);
753 vfitab = _mm256_cvttps_epi32(rt);
754 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
755 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
756 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
757 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
758 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
759 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
761 /* CUBIC SPLINE TABLE ELECTROSTATICS */
762 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
763 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
764 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
765 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
766 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
767 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
768 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
769 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
770 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
771 Heps = _mm256_mul_ps(vfeps,H);
772 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
773 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
774 velec = _mm256_mul_ps(qq30,VV);
775 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
776 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
778 /* Update potential sum for this i atom from the interaction with this j atom. */
779 velec = _mm256_andnot_ps(dummy_mask,velec);
780 velecsum = _mm256_add_ps(velecsum,velec);
784 fscal = _mm256_andnot_ps(dummy_mask,fscal);
786 /* Calculate temporary vectorial force */
787 tx = _mm256_mul_ps(fscal,dx30);
788 ty = _mm256_mul_ps(fscal,dy30);
789 tz = _mm256_mul_ps(fscal,dz30);
791 /* Update vectorial force */
792 fix3 = _mm256_add_ps(fix3,tx);
793 fiy3 = _mm256_add_ps(fiy3,ty);
794 fiz3 = _mm256_add_ps(fiz3,tz);
796 fjx0 = _mm256_add_ps(fjx0,tx);
797 fjy0 = _mm256_add_ps(fjy0,ty);
798 fjz0 = _mm256_add_ps(fjz0,tz);
/* Dummy entries are pointed at scratch instead of f, so the force update below
 * never touches a real atom on their behalf.
 */
800 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
801 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
802 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
803 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
804 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
805 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
806 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
807 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
809 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
811 /* Inner loop uses 167 flops */
814 /* End of innermost loop */
816 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
817 f+i_coord_offset,fshift+i_shift_offset);
820 /* Update potential energies */
821 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
822 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
824 /* Increment number of inner iterations */
825 inneriter += j_index_end - j_index_start;
827 /* Outer loop uses 26 flops */
830 /* Increment number of outer iterations */
833 /* Update outer/inner flops */
/* Flop estimate: 26 per outer iteration plus 167 per counted inner iteration. */
835 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*167);
838 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_single
839 * Electrostatics interaction: CubicSplineTable
840 * VdW interaction: LennardJones
841 * Geometry: Water4-Particle
842 * Calculate force/pot: Force
845 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_single
846 (t_nblist * gmx_restrict nlist,
847 rvec * gmx_restrict xx,
848 rvec * gmx_restrict ff,
849 struct t_forcerec * gmx_restrict fr,
850 t_mdatoms * gmx_restrict mdatoms,
851 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
852 t_nrnb * gmx_restrict nrnb)
854 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
855 * just 0 for non-waters.
856 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
857 * jnr indices corresponding to data put in the eight positions in the SIMD register.
859 int i_shift_offset,i_coord_offset,outeriter,inneriter;
860 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
861 int jnrA,jnrB,jnrC,jnrD;
862 int jnrE,jnrF,jnrG,jnrH;
863 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
864 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
865 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
866 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
867 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
869 real *shiftvec,*fshift,*x,*f;
870 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
872 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
873 real * vdwioffsetptr0;
874 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
875 real * vdwioffsetptr1;
876 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
877 real * vdwioffsetptr2;
878 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
879 real * vdwioffsetptr3;
880 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
881 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
882 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
883 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
884 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
885 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
886 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
887 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
890 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
893 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
894 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
896 __m128i vfitab_lo,vfitab_hi;
897 __m128i ifour = _mm_set1_epi32(4);
898 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
900 __m256 dummy_mask,cutoff_mask;
901 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
902 __m256 one = _mm256_set1_ps(1.0);
903 __m256 two = _mm256_set1_ps(2.0);
909 jindex = nlist->jindex;
911 shiftidx = nlist->shift;
913 shiftvec = fr->shift_vec[0];
914 fshift = fr->fshift[0];
915 facel = _mm256_set1_ps(fr->ic->epsfac);
916 charge = mdatoms->chargeA;
917 nvdwtype = fr->ntype;
919 vdwtype = mdatoms->typeA;
921 vftab = kernel_data->table_elec->data;
922 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
924 /* Setup water-specific parameters */
925 inr = nlist->iinr[0];
926 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
927 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
928 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
929 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
931 /* Zero the j neighbor indices up front to avoid spurious uninitialized-variable compiler warnings */
932 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
945 for(iidx=0;iidx<4*DIM;iidx++)
950 /* Start outer loop over neighborlists */
951 for(iidx=0; iidx<nri; iidx++)
953 /* Load shift vector for this list */
954 i_shift_offset = DIM*shiftidx[iidx];
956 /* Load limits for loop over neighbors */
957 j_index_start = jindex[iidx];
958 j_index_end = jindex[iidx+1];
960 /* Get outer coordinate index */
962 i_coord_offset = DIM*inr;
964 /* Load i particle coords and add shift vector */
965 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
966 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
968 fix0 = _mm256_setzero_ps();
969 fiy0 = _mm256_setzero_ps();
970 fiz0 = _mm256_setzero_ps();
971 fix1 = _mm256_setzero_ps();
972 fiy1 = _mm256_setzero_ps();
973 fiz1 = _mm256_setzero_ps();
974 fix2 = _mm256_setzero_ps();
975 fiy2 = _mm256_setzero_ps();
976 fiz2 = _mm256_setzero_ps();
977 fix3 = _mm256_setzero_ps();
978 fiy3 = _mm256_setzero_ps();
979 fiz3 = _mm256_setzero_ps();
981 /* Start inner kernel loop */
982 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
985 /* Get j neighbor index, and coordinate index */
994 j_coord_offsetA = DIM*jnrA;
995 j_coord_offsetB = DIM*jnrB;
996 j_coord_offsetC = DIM*jnrC;
997 j_coord_offsetD = DIM*jnrD;
998 j_coord_offsetE = DIM*jnrE;
999 j_coord_offsetF = DIM*jnrF;
1000 j_coord_offsetG = DIM*jnrG;
1001 j_coord_offsetH = DIM*jnrH;
1003 /* load j atom coordinates */
1004 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1005 x+j_coord_offsetC,x+j_coord_offsetD,
1006 x+j_coord_offsetE,x+j_coord_offsetF,
1007 x+j_coord_offsetG,x+j_coord_offsetH,
1010 /* Calculate displacement vector */
1011 dx00 = _mm256_sub_ps(ix0,jx0);
1012 dy00 = _mm256_sub_ps(iy0,jy0);
1013 dz00 = _mm256_sub_ps(iz0,jz0);
1014 dx10 = _mm256_sub_ps(ix1,jx0);
1015 dy10 = _mm256_sub_ps(iy1,jy0);
1016 dz10 = _mm256_sub_ps(iz1,jz0);
1017 dx20 = _mm256_sub_ps(ix2,jx0);
1018 dy20 = _mm256_sub_ps(iy2,jy0);
1019 dz20 = _mm256_sub_ps(iz2,jz0);
1020 dx30 = _mm256_sub_ps(ix3,jx0);
1021 dy30 = _mm256_sub_ps(iy3,jy0);
1022 dz30 = _mm256_sub_ps(iz3,jz0);
1024 /* Calculate squared distance and things based on it */
1025 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1026 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1027 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1028 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1030 rinv10 = avx256_invsqrt_f(rsq10);
1031 rinv20 = avx256_invsqrt_f(rsq20);
1032 rinv30 = avx256_invsqrt_f(rsq30);
1034 rinvsq00 = avx256_inv_f(rsq00);
1036 /* Load parameters for j particles */
1037 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1038 charge+jnrC+0,charge+jnrD+0,
1039 charge+jnrE+0,charge+jnrF+0,
1040 charge+jnrG+0,charge+jnrH+0);
1041 vdwjidx0A = 2*vdwtype[jnrA+0];
1042 vdwjidx0B = 2*vdwtype[jnrB+0];
1043 vdwjidx0C = 2*vdwtype[jnrC+0];
1044 vdwjidx0D = 2*vdwtype[jnrD+0];
1045 vdwjidx0E = 2*vdwtype[jnrE+0];
1046 vdwjidx0F = 2*vdwtype[jnrF+0];
1047 vdwjidx0G = 2*vdwtype[jnrG+0];
1048 vdwjidx0H = 2*vdwtype[jnrH+0];
1050 fjx0 = _mm256_setzero_ps();
1051 fjy0 = _mm256_setzero_ps();
1052 fjz0 = _mm256_setzero_ps();
1054 /**************************
1055 * CALCULATE INTERACTIONS *
1056 **************************/
1058 /* Compute parameters for interactions between i and j atoms */
1059 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1060 vdwioffsetptr0+vdwjidx0B,
1061 vdwioffsetptr0+vdwjidx0C,
1062 vdwioffsetptr0+vdwjidx0D,
1063 vdwioffsetptr0+vdwjidx0E,
1064 vdwioffsetptr0+vdwjidx0F,
1065 vdwioffsetptr0+vdwjidx0G,
1066 vdwioffsetptr0+vdwjidx0H,
1069 /* LENNARD-JONES DISPERSION/REPULSION */
1071 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1072 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1076 /* Calculate temporary vectorial force */
1077 tx = _mm256_mul_ps(fscal,dx00);
1078 ty = _mm256_mul_ps(fscal,dy00);
1079 tz = _mm256_mul_ps(fscal,dz00);
1081 /* Update vectorial force */
1082 fix0 = _mm256_add_ps(fix0,tx);
1083 fiy0 = _mm256_add_ps(fiy0,ty);
1084 fiz0 = _mm256_add_ps(fiz0,tz);
1086 fjx0 = _mm256_add_ps(fjx0,tx);
1087 fjy0 = _mm256_add_ps(fjy0,ty);
1088 fjz0 = _mm256_add_ps(fjz0,tz);
1090 /**************************
1091 * CALCULATE INTERACTIONS *
1092 **************************/
1094 r10 = _mm256_mul_ps(rsq10,rinv10);
1096 /* Compute parameters for interactions between i and j atoms */
1097 qq10 = _mm256_mul_ps(iq1,jq0);
1099 /* Calculate table index by multiplying r with table scale and truncate to integer */
1100 rt = _mm256_mul_ps(r10,vftabscale);
1101 vfitab = _mm256_cvttps_epi32(rt);
1102 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1103 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1104 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1105 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1106 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1107 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1109 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1110 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1111 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1112 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1113 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1114 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1115 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1116 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1117 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1118 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1119 Heps = _mm256_mul_ps(vfeps,H);
1120 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1121 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1122 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1126 /* Calculate temporary vectorial force */
1127 tx = _mm256_mul_ps(fscal,dx10);
1128 ty = _mm256_mul_ps(fscal,dy10);
1129 tz = _mm256_mul_ps(fscal,dz10);
1131 /* Update vectorial force */
1132 fix1 = _mm256_add_ps(fix1,tx);
1133 fiy1 = _mm256_add_ps(fiy1,ty);
1134 fiz1 = _mm256_add_ps(fiz1,tz);
1136 fjx0 = _mm256_add_ps(fjx0,tx);
1137 fjy0 = _mm256_add_ps(fjy0,ty);
1138 fjz0 = _mm256_add_ps(fjz0,tz);
1140 /**************************
1141 * CALCULATE INTERACTIONS *
1142 **************************/
1144 r20 = _mm256_mul_ps(rsq20,rinv20);
1146 /* Compute parameters for interactions between i and j atoms */
1147 qq20 = _mm256_mul_ps(iq2,jq0);
1149 /* Calculate table index by multiplying r with table scale and truncate to integer */
1150 rt = _mm256_mul_ps(r20,vftabscale);
1151 vfitab = _mm256_cvttps_epi32(rt);
1152 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1153 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1154 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1155 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1156 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1157 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1159 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1160 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1161 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1162 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1163 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1164 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1165 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1166 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1167 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1168 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1169 Heps = _mm256_mul_ps(vfeps,H);
1170 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1171 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1172 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1176 /* Calculate temporary vectorial force */
1177 tx = _mm256_mul_ps(fscal,dx20);
1178 ty = _mm256_mul_ps(fscal,dy20);
1179 tz = _mm256_mul_ps(fscal,dz20);
1181 /* Update vectorial force */
1182 fix2 = _mm256_add_ps(fix2,tx);
1183 fiy2 = _mm256_add_ps(fiy2,ty);
1184 fiz2 = _mm256_add_ps(fiz2,tz);
1186 fjx0 = _mm256_add_ps(fjx0,tx);
1187 fjy0 = _mm256_add_ps(fjy0,ty);
1188 fjz0 = _mm256_add_ps(fjz0,tz);
1190 /**************************
1191 * CALCULATE INTERACTIONS *
1192 **************************/
1194 r30 = _mm256_mul_ps(rsq30,rinv30);
1196 /* Compute parameters for interactions between i and j atoms */
1197 qq30 = _mm256_mul_ps(iq3,jq0);
1199 /* Calculate table index by multiplying r with table scale and truncate to integer */
1200 rt = _mm256_mul_ps(r30,vftabscale);
1201 vfitab = _mm256_cvttps_epi32(rt);
1202 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1203 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1204 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1205 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1206 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1207 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1209 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1210 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1211 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1212 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1213 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1214 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1215 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1216 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1217 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1218 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1219 Heps = _mm256_mul_ps(vfeps,H);
1220 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1221 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1222 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
1226 /* Calculate temporary vectorial force */
1227 tx = _mm256_mul_ps(fscal,dx30);
1228 ty = _mm256_mul_ps(fscal,dy30);
1229 tz = _mm256_mul_ps(fscal,dz30);
1231 /* Update vectorial force */
1232 fix3 = _mm256_add_ps(fix3,tx);
1233 fiy3 = _mm256_add_ps(fiy3,ty);
1234 fiz3 = _mm256_add_ps(fiz3,tz);
1236 fjx0 = _mm256_add_ps(fjx0,tx);
1237 fjy0 = _mm256_add_ps(fjy0,ty);
1238 fjz0 = _mm256_add_ps(fjz0,tz);
1240 fjptrA = f+j_coord_offsetA;
1241 fjptrB = f+j_coord_offsetB;
1242 fjptrC = f+j_coord_offsetC;
1243 fjptrD = f+j_coord_offsetD;
1244 fjptrE = f+j_coord_offsetE;
1245 fjptrF = f+j_coord_offsetF;
1246 fjptrG = f+j_coord_offsetG;
1247 fjptrH = f+j_coord_offsetH;
1249 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1251 /* Inner loop uses 147 flops */
1254 if(jidx<j_index_end)
1257 /* Get j neighbor index, and coordinate index */
1258 jnrlistA = jjnr[jidx];
1259 jnrlistB = jjnr[jidx+1];
1260 jnrlistC = jjnr[jidx+2];
1261 jnrlistD = jjnr[jidx+3];
1262 jnrlistE = jjnr[jidx+4];
1263 jnrlistF = jjnr[jidx+5];
1264 jnrlistG = jjnr[jidx+6];
1265 jnrlistH = jjnr[jidx+7];
1266 /* Sign of each element will be negative for non-real atoms.
1267 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1268 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1270 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1271 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1273 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1274 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1275 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1276 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1277 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1278 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1279 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1280 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1281 j_coord_offsetA = DIM*jnrA;
1282 j_coord_offsetB = DIM*jnrB;
1283 j_coord_offsetC = DIM*jnrC;
1284 j_coord_offsetD = DIM*jnrD;
1285 j_coord_offsetE = DIM*jnrE;
1286 j_coord_offsetF = DIM*jnrF;
1287 j_coord_offsetG = DIM*jnrG;
1288 j_coord_offsetH = DIM*jnrH;
1290 /* load j atom coordinates */
1291 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1292 x+j_coord_offsetC,x+j_coord_offsetD,
1293 x+j_coord_offsetE,x+j_coord_offsetF,
1294 x+j_coord_offsetG,x+j_coord_offsetH,
1297 /* Calculate displacement vector */
1298 dx00 = _mm256_sub_ps(ix0,jx0);
1299 dy00 = _mm256_sub_ps(iy0,jy0);
1300 dz00 = _mm256_sub_ps(iz0,jz0);
1301 dx10 = _mm256_sub_ps(ix1,jx0);
1302 dy10 = _mm256_sub_ps(iy1,jy0);
1303 dz10 = _mm256_sub_ps(iz1,jz0);
1304 dx20 = _mm256_sub_ps(ix2,jx0);
1305 dy20 = _mm256_sub_ps(iy2,jy0);
1306 dz20 = _mm256_sub_ps(iz2,jz0);
1307 dx30 = _mm256_sub_ps(ix3,jx0);
1308 dy30 = _mm256_sub_ps(iy3,jy0);
1309 dz30 = _mm256_sub_ps(iz3,jz0);
1311 /* Calculate squared distance and things based on it */
1312 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1313 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1314 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1315 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1317 rinv10 = avx256_invsqrt_f(rsq10);
1318 rinv20 = avx256_invsqrt_f(rsq20);
1319 rinv30 = avx256_invsqrt_f(rsq30);
1321 rinvsq00 = avx256_inv_f(rsq00);
1323 /* Load parameters for j particles */
1324 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1325 charge+jnrC+0,charge+jnrD+0,
1326 charge+jnrE+0,charge+jnrF+0,
1327 charge+jnrG+0,charge+jnrH+0);
1328 vdwjidx0A = 2*vdwtype[jnrA+0];
1329 vdwjidx0B = 2*vdwtype[jnrB+0];
1330 vdwjidx0C = 2*vdwtype[jnrC+0];
1331 vdwjidx0D = 2*vdwtype[jnrD+0];
1332 vdwjidx0E = 2*vdwtype[jnrE+0];
1333 vdwjidx0F = 2*vdwtype[jnrF+0];
1334 vdwjidx0G = 2*vdwtype[jnrG+0];
1335 vdwjidx0H = 2*vdwtype[jnrH+0];
1337 fjx0 = _mm256_setzero_ps();
1338 fjy0 = _mm256_setzero_ps();
1339 fjz0 = _mm256_setzero_ps();
1341 /**************************
1342 * CALCULATE INTERACTIONS *
1343 **************************/
1345 /* Compute parameters for interactions between i and j atoms */
1346 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1347 vdwioffsetptr0+vdwjidx0B,
1348 vdwioffsetptr0+vdwjidx0C,
1349 vdwioffsetptr0+vdwjidx0D,
1350 vdwioffsetptr0+vdwjidx0E,
1351 vdwioffsetptr0+vdwjidx0F,
1352 vdwioffsetptr0+vdwjidx0G,
1353 vdwioffsetptr0+vdwjidx0H,
1356 /* LENNARD-JONES DISPERSION/REPULSION */
1358 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1359 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1363 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1365 /* Calculate temporary vectorial force */
1366 tx = _mm256_mul_ps(fscal,dx00);
1367 ty = _mm256_mul_ps(fscal,dy00);
1368 tz = _mm256_mul_ps(fscal,dz00);
1370 /* Update vectorial force */
1371 fix0 = _mm256_add_ps(fix0,tx);
1372 fiy0 = _mm256_add_ps(fiy0,ty);
1373 fiz0 = _mm256_add_ps(fiz0,tz);
1375 fjx0 = _mm256_add_ps(fjx0,tx);
1376 fjy0 = _mm256_add_ps(fjy0,ty);
1377 fjz0 = _mm256_add_ps(fjz0,tz);
1379 /**************************
1380 * CALCULATE INTERACTIONS *
1381 **************************/
1383 r10 = _mm256_mul_ps(rsq10,rinv10);
1384 r10 = _mm256_andnot_ps(dummy_mask,r10);
1386 /* Compute parameters for interactions between i and j atoms */
1387 qq10 = _mm256_mul_ps(iq1,jq0);
1389 /* Calculate table index by multiplying r with table scale and truncate to integer */
1390 rt = _mm256_mul_ps(r10,vftabscale);
1391 vfitab = _mm256_cvttps_epi32(rt);
1392 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1393 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1394 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1395 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1396 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1397 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1399 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1400 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1401 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1402 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1403 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1404 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1405 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1406 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1407 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1408 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1409 Heps = _mm256_mul_ps(vfeps,H);
1410 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1411 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1412 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1416 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1418 /* Calculate temporary vectorial force */
1419 tx = _mm256_mul_ps(fscal,dx10);
1420 ty = _mm256_mul_ps(fscal,dy10);
1421 tz = _mm256_mul_ps(fscal,dz10);
1423 /* Update vectorial force */
1424 fix1 = _mm256_add_ps(fix1,tx);
1425 fiy1 = _mm256_add_ps(fiy1,ty);
1426 fiz1 = _mm256_add_ps(fiz1,tz);
1428 fjx0 = _mm256_add_ps(fjx0,tx);
1429 fjy0 = _mm256_add_ps(fjy0,ty);
1430 fjz0 = _mm256_add_ps(fjz0,tz);
1432 /**************************
1433 * CALCULATE INTERACTIONS *
1434 **************************/
1436 r20 = _mm256_mul_ps(rsq20,rinv20);
1437 r20 = _mm256_andnot_ps(dummy_mask,r20);
1439 /* Compute parameters for interactions between i and j atoms */
1440 qq20 = _mm256_mul_ps(iq2,jq0);
1442 /* Calculate table index by multiplying r with table scale and truncate to integer */
1443 rt = _mm256_mul_ps(r20,vftabscale);
1444 vfitab = _mm256_cvttps_epi32(rt);
1445 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1446 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1447 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1448 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1449 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1450 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1452 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1453 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1454 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1455 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1456 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1457 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1458 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1459 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1460 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1461 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1462 Heps = _mm256_mul_ps(vfeps,H);
1463 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1464 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1465 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1469 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1471 /* Calculate temporary vectorial force */
1472 tx = _mm256_mul_ps(fscal,dx20);
1473 ty = _mm256_mul_ps(fscal,dy20);
1474 tz = _mm256_mul_ps(fscal,dz20);
1476 /* Update vectorial force */
1477 fix2 = _mm256_add_ps(fix2,tx);
1478 fiy2 = _mm256_add_ps(fiy2,ty);
1479 fiz2 = _mm256_add_ps(fiz2,tz);
1481 fjx0 = _mm256_add_ps(fjx0,tx);
1482 fjy0 = _mm256_add_ps(fjy0,ty);
1483 fjz0 = _mm256_add_ps(fjz0,tz);
1485 /**************************
1486 * CALCULATE INTERACTIONS *
1487 **************************/
1489 r30 = _mm256_mul_ps(rsq30,rinv30);
1490 r30 = _mm256_andnot_ps(dummy_mask,r30);
1492 /* Compute parameters for interactions between i and j atoms */
1493 qq30 = _mm256_mul_ps(iq3,jq0);
1495 /* Calculate table index by multiplying r with table scale and truncate to integer */
1496 rt = _mm256_mul_ps(r30,vftabscale);
1497 vfitab = _mm256_cvttps_epi32(rt);
1498 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1499 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1500 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1501 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1502 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1503 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1505 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1506 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1507 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1508 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1509 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1510 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1511 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1512 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1513 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1514 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1515 Heps = _mm256_mul_ps(vfeps,H);
1516 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1517 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1518 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
1522 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1524 /* Calculate temporary vectorial force */
1525 tx = _mm256_mul_ps(fscal,dx30);
1526 ty = _mm256_mul_ps(fscal,dy30);
1527 tz = _mm256_mul_ps(fscal,dz30);
1529 /* Update vectorial force */
1530 fix3 = _mm256_add_ps(fix3,tx);
1531 fiy3 = _mm256_add_ps(fiy3,ty);
1532 fiz3 = _mm256_add_ps(fiz3,tz);
1534 fjx0 = _mm256_add_ps(fjx0,tx);
1535 fjy0 = _mm256_add_ps(fjy0,ty);
1536 fjz0 = _mm256_add_ps(fjz0,tz);
1538 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1539 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1540 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1541 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1542 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1543 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1544 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1545 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1547 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1549 /* Inner loop uses 150 flops */
1552 /* End of innermost loop */
1554 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1555 f+i_coord_offset,fshift+i_shift_offset);
1557 /* Increment number of inner iterations */
1558 inneriter += j_index_end - j_index_start;
1560 /* Outer loop uses 24 flops */
1563 /* Increment number of outer iterations */
1566 /* Update outer/inner flops */
1568 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*150);