2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_single
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: LennardJones
56 * Geometry: Water4-Particle
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the four positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr0;
89 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90 real * vdwioffsetptr1;
91 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92 real * vdwioffsetptr2;
93 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94 real * vdwioffsetptr3;
95 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
96 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
97 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
98 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
100 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
101 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
102 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
105 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
108 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
109 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
111 __m128i vfitab_lo,vfitab_hi;
112 __m128i ifour = _mm_set1_epi32(4);
113 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
115 __m256 dummy_mask,cutoff_mask;
116 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
117 __m256 one = _mm256_set1_ps(1.0);
118 __m256 two = _mm256_set1_ps(2.0);
124 jindex = nlist->jindex;
126 shiftidx = nlist->shift;
128 shiftvec = fr->shift_vec[0];
129 fshift = fr->fshift[0];
130 facel = _mm256_set1_ps(fr->epsfac);
131 charge = mdatoms->chargeA;
132 nvdwtype = fr->ntype;
134 vdwtype = mdatoms->typeA;
136 vftab = kernel_data->table_elec->data;
137 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
139 /* Setup water-specific parameters */
140 inr = nlist->iinr[0];
141 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
142 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
143 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
144 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
146 /* Avoid stupid compiler warnings */
147 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
160 for(iidx=0;iidx<4*DIM;iidx++)
165 /* Start outer loop over neighborlists */
166 for(iidx=0; iidx<nri; iidx++)
168 /* Load shift vector for this list */
169 i_shift_offset = DIM*shiftidx[iidx];
171 /* Load limits for loop over neighbors */
172 j_index_start = jindex[iidx];
173 j_index_end = jindex[iidx+1];
175 /* Get outer coordinate index */
177 i_coord_offset = DIM*inr;
179 /* Load i particle coords and add shift vector */
180 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
181 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
183 fix0 = _mm256_setzero_ps();
184 fiy0 = _mm256_setzero_ps();
185 fiz0 = _mm256_setzero_ps();
186 fix1 = _mm256_setzero_ps();
187 fiy1 = _mm256_setzero_ps();
188 fiz1 = _mm256_setzero_ps();
189 fix2 = _mm256_setzero_ps();
190 fiy2 = _mm256_setzero_ps();
191 fiz2 = _mm256_setzero_ps();
192 fix3 = _mm256_setzero_ps();
193 fiy3 = _mm256_setzero_ps();
194 fiz3 = _mm256_setzero_ps();
196 /* Reset potential sums */
197 velecsum = _mm256_setzero_ps();
198 vvdwsum = _mm256_setzero_ps();
200 /* Start inner kernel loop */
201 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
204 /* Get j neighbor index, and coordinate index */
213 j_coord_offsetA = DIM*jnrA;
214 j_coord_offsetB = DIM*jnrB;
215 j_coord_offsetC = DIM*jnrC;
216 j_coord_offsetD = DIM*jnrD;
217 j_coord_offsetE = DIM*jnrE;
218 j_coord_offsetF = DIM*jnrF;
219 j_coord_offsetG = DIM*jnrG;
220 j_coord_offsetH = DIM*jnrH;
222 /* load j atom coordinates */
223 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
224 x+j_coord_offsetC,x+j_coord_offsetD,
225 x+j_coord_offsetE,x+j_coord_offsetF,
226 x+j_coord_offsetG,x+j_coord_offsetH,
229 /* Calculate displacement vector */
230 dx00 = _mm256_sub_ps(ix0,jx0);
231 dy00 = _mm256_sub_ps(iy0,jy0);
232 dz00 = _mm256_sub_ps(iz0,jz0);
233 dx10 = _mm256_sub_ps(ix1,jx0);
234 dy10 = _mm256_sub_ps(iy1,jy0);
235 dz10 = _mm256_sub_ps(iz1,jz0);
236 dx20 = _mm256_sub_ps(ix2,jx0);
237 dy20 = _mm256_sub_ps(iy2,jy0);
238 dz20 = _mm256_sub_ps(iz2,jz0);
239 dx30 = _mm256_sub_ps(ix3,jx0);
240 dy30 = _mm256_sub_ps(iy3,jy0);
241 dz30 = _mm256_sub_ps(iz3,jz0);
243 /* Calculate squared distance and things based on it */
244 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
245 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
246 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
247 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
249 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
250 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
251 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
253 rinvsq00 = gmx_mm256_inv_ps(rsq00);
255 /* Load parameters for j particles */
256 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
257 charge+jnrC+0,charge+jnrD+0,
258 charge+jnrE+0,charge+jnrF+0,
259 charge+jnrG+0,charge+jnrH+0);
260 vdwjidx0A = 2*vdwtype[jnrA+0];
261 vdwjidx0B = 2*vdwtype[jnrB+0];
262 vdwjidx0C = 2*vdwtype[jnrC+0];
263 vdwjidx0D = 2*vdwtype[jnrD+0];
264 vdwjidx0E = 2*vdwtype[jnrE+0];
265 vdwjidx0F = 2*vdwtype[jnrF+0];
266 vdwjidx0G = 2*vdwtype[jnrG+0];
267 vdwjidx0H = 2*vdwtype[jnrH+0];
269 fjx0 = _mm256_setzero_ps();
270 fjy0 = _mm256_setzero_ps();
271 fjz0 = _mm256_setzero_ps();
273 /**************************
274 * CALCULATE INTERACTIONS *
275 **************************/
277 /* Compute parameters for interactions between i and j atoms */
278 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
279 vdwioffsetptr0+vdwjidx0B,
280 vdwioffsetptr0+vdwjidx0C,
281 vdwioffsetptr0+vdwjidx0D,
282 vdwioffsetptr0+vdwjidx0E,
283 vdwioffsetptr0+vdwjidx0F,
284 vdwioffsetptr0+vdwjidx0G,
285 vdwioffsetptr0+vdwjidx0H,
288 /* LENNARD-JONES DISPERSION/REPULSION */
290 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
291 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
292 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
293 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
294 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
296 /* Update potential sum for this i atom from the interaction with this j atom. */
297 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
301 /* Calculate temporary vectorial force */
302 tx = _mm256_mul_ps(fscal,dx00);
303 ty = _mm256_mul_ps(fscal,dy00);
304 tz = _mm256_mul_ps(fscal,dz00);
306 /* Update vectorial force */
307 fix0 = _mm256_add_ps(fix0,tx);
308 fiy0 = _mm256_add_ps(fiy0,ty);
309 fiz0 = _mm256_add_ps(fiz0,tz);
311 fjx0 = _mm256_add_ps(fjx0,tx);
312 fjy0 = _mm256_add_ps(fjy0,ty);
313 fjz0 = _mm256_add_ps(fjz0,tz);
315 /**************************
316 * CALCULATE INTERACTIONS *
317 **************************/
319 r10 = _mm256_mul_ps(rsq10,rinv10);
321 /* Compute parameters for interactions between i and j atoms */
322 qq10 = _mm256_mul_ps(iq1,jq0);
324 /* Calculate table index by multiplying r with table scale and truncate to integer */
325 rt = _mm256_mul_ps(r10,vftabscale);
326 vfitab = _mm256_cvttps_epi32(rt);
327 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
328 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
329 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
330 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
331 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
332 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
334 /* CUBIC SPLINE TABLE ELECTROSTATICS */
335 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
336 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
337 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
338 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
339 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
340 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
341 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
342 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
343 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
344 Heps = _mm256_mul_ps(vfeps,H);
345 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
346 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
347 velec = _mm256_mul_ps(qq10,VV);
348 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
349 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
351 /* Update potential sum for this i atom from the interaction with this j atom. */
352 velecsum = _mm256_add_ps(velecsum,velec);
356 /* Calculate temporary vectorial force */
357 tx = _mm256_mul_ps(fscal,dx10);
358 ty = _mm256_mul_ps(fscal,dy10);
359 tz = _mm256_mul_ps(fscal,dz10);
361 /* Update vectorial force */
362 fix1 = _mm256_add_ps(fix1,tx);
363 fiy1 = _mm256_add_ps(fiy1,ty);
364 fiz1 = _mm256_add_ps(fiz1,tz);
366 fjx0 = _mm256_add_ps(fjx0,tx);
367 fjy0 = _mm256_add_ps(fjy0,ty);
368 fjz0 = _mm256_add_ps(fjz0,tz);
370 /**************************
371 * CALCULATE INTERACTIONS *
372 **************************/
374 r20 = _mm256_mul_ps(rsq20,rinv20);
376 /* Compute parameters for interactions between i and j atoms */
377 qq20 = _mm256_mul_ps(iq2,jq0);
379 /* Calculate table index by multiplying r with table scale and truncate to integer */
380 rt = _mm256_mul_ps(r20,vftabscale);
381 vfitab = _mm256_cvttps_epi32(rt);
382 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
383 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
384 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
385 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
386 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
387 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
389 /* CUBIC SPLINE TABLE ELECTROSTATICS */
390 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
391 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
392 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
393 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
394 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
395 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
396 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
397 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
398 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
399 Heps = _mm256_mul_ps(vfeps,H);
400 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
401 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
402 velec = _mm256_mul_ps(qq20,VV);
403 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
404 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
406 /* Update potential sum for this i atom from the interaction with this j atom. */
407 velecsum = _mm256_add_ps(velecsum,velec);
411 /* Calculate temporary vectorial force */
412 tx = _mm256_mul_ps(fscal,dx20);
413 ty = _mm256_mul_ps(fscal,dy20);
414 tz = _mm256_mul_ps(fscal,dz20);
416 /* Update vectorial force */
417 fix2 = _mm256_add_ps(fix2,tx);
418 fiy2 = _mm256_add_ps(fiy2,ty);
419 fiz2 = _mm256_add_ps(fiz2,tz);
421 fjx0 = _mm256_add_ps(fjx0,tx);
422 fjy0 = _mm256_add_ps(fjy0,ty);
423 fjz0 = _mm256_add_ps(fjz0,tz);
425 /**************************
426 * CALCULATE INTERACTIONS *
427 **************************/
429 r30 = _mm256_mul_ps(rsq30,rinv30);
431 /* Compute parameters for interactions between i and j atoms */
432 qq30 = _mm256_mul_ps(iq3,jq0);
434 /* Calculate table index by multiplying r with table scale and truncate to integer */
435 rt = _mm256_mul_ps(r30,vftabscale);
436 vfitab = _mm256_cvttps_epi32(rt);
437 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
438 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
439 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
440 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
441 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
442 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
444 /* CUBIC SPLINE TABLE ELECTROSTATICS */
445 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
446 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
447 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
448 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
449 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
450 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
451 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
452 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
453 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
454 Heps = _mm256_mul_ps(vfeps,H);
455 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
456 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
457 velec = _mm256_mul_ps(qq30,VV);
458 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
459 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
461 /* Update potential sum for this i atom from the interaction with this j atom. */
462 velecsum = _mm256_add_ps(velecsum,velec);
466 /* Calculate temporary vectorial force */
467 tx = _mm256_mul_ps(fscal,dx30);
468 ty = _mm256_mul_ps(fscal,dy30);
469 tz = _mm256_mul_ps(fscal,dz30);
471 /* Update vectorial force */
472 fix3 = _mm256_add_ps(fix3,tx);
473 fiy3 = _mm256_add_ps(fiy3,ty);
474 fiz3 = _mm256_add_ps(fiz3,tz);
476 fjx0 = _mm256_add_ps(fjx0,tx);
477 fjy0 = _mm256_add_ps(fjy0,ty);
478 fjz0 = _mm256_add_ps(fjz0,tz);
480 fjptrA = f+j_coord_offsetA;
481 fjptrB = f+j_coord_offsetB;
482 fjptrC = f+j_coord_offsetC;
483 fjptrD = f+j_coord_offsetD;
484 fjptrE = f+j_coord_offsetE;
485 fjptrF = f+j_coord_offsetF;
486 fjptrG = f+j_coord_offsetG;
487 fjptrH = f+j_coord_offsetH;
489 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
491 /* Inner loop uses 164 flops */
497 /* Get j neighbor index, and coordinate index */
498 jnrlistA = jjnr[jidx];
499 jnrlistB = jjnr[jidx+1];
500 jnrlistC = jjnr[jidx+2];
501 jnrlistD = jjnr[jidx+3];
502 jnrlistE = jjnr[jidx+4];
503 jnrlistF = jjnr[jidx+5];
504 jnrlistG = jjnr[jidx+6];
505 jnrlistH = jjnr[jidx+7];
506 /* Sign of each element will be negative for non-real atoms.
507 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
508 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
510 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
511 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
513 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
514 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
515 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
516 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
517 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
518 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
519 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
520 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
521 j_coord_offsetA = DIM*jnrA;
522 j_coord_offsetB = DIM*jnrB;
523 j_coord_offsetC = DIM*jnrC;
524 j_coord_offsetD = DIM*jnrD;
525 j_coord_offsetE = DIM*jnrE;
526 j_coord_offsetF = DIM*jnrF;
527 j_coord_offsetG = DIM*jnrG;
528 j_coord_offsetH = DIM*jnrH;
530 /* load j atom coordinates */
531 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
532 x+j_coord_offsetC,x+j_coord_offsetD,
533 x+j_coord_offsetE,x+j_coord_offsetF,
534 x+j_coord_offsetG,x+j_coord_offsetH,
537 /* Calculate displacement vector */
538 dx00 = _mm256_sub_ps(ix0,jx0);
539 dy00 = _mm256_sub_ps(iy0,jy0);
540 dz00 = _mm256_sub_ps(iz0,jz0);
541 dx10 = _mm256_sub_ps(ix1,jx0);
542 dy10 = _mm256_sub_ps(iy1,jy0);
543 dz10 = _mm256_sub_ps(iz1,jz0);
544 dx20 = _mm256_sub_ps(ix2,jx0);
545 dy20 = _mm256_sub_ps(iy2,jy0);
546 dz20 = _mm256_sub_ps(iz2,jz0);
547 dx30 = _mm256_sub_ps(ix3,jx0);
548 dy30 = _mm256_sub_ps(iy3,jy0);
549 dz30 = _mm256_sub_ps(iz3,jz0);
551 /* Calculate squared distance and things based on it */
552 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
553 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
554 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
555 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
557 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
558 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
559 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
561 rinvsq00 = gmx_mm256_inv_ps(rsq00);
563 /* Load parameters for j particles */
564 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
565 charge+jnrC+0,charge+jnrD+0,
566 charge+jnrE+0,charge+jnrF+0,
567 charge+jnrG+0,charge+jnrH+0);
568 vdwjidx0A = 2*vdwtype[jnrA+0];
569 vdwjidx0B = 2*vdwtype[jnrB+0];
570 vdwjidx0C = 2*vdwtype[jnrC+0];
571 vdwjidx0D = 2*vdwtype[jnrD+0];
572 vdwjidx0E = 2*vdwtype[jnrE+0];
573 vdwjidx0F = 2*vdwtype[jnrF+0];
574 vdwjidx0G = 2*vdwtype[jnrG+0];
575 vdwjidx0H = 2*vdwtype[jnrH+0];
577 fjx0 = _mm256_setzero_ps();
578 fjy0 = _mm256_setzero_ps();
579 fjz0 = _mm256_setzero_ps();
581 /**************************
582 * CALCULATE INTERACTIONS *
583 **************************/
585 /* Compute parameters for interactions between i and j atoms */
586 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
587 vdwioffsetptr0+vdwjidx0B,
588 vdwioffsetptr0+vdwjidx0C,
589 vdwioffsetptr0+vdwjidx0D,
590 vdwioffsetptr0+vdwjidx0E,
591 vdwioffsetptr0+vdwjidx0F,
592 vdwioffsetptr0+vdwjidx0G,
593 vdwioffsetptr0+vdwjidx0H,
596 /* LENNARD-JONES DISPERSION/REPULSION */
598 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
599 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
600 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
601 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
602 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
604 /* Update potential sum for this i atom from the interaction with this j atom. */
605 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
606 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
610 fscal = _mm256_andnot_ps(dummy_mask,fscal);
612 /* Calculate temporary vectorial force */
613 tx = _mm256_mul_ps(fscal,dx00);
614 ty = _mm256_mul_ps(fscal,dy00);
615 tz = _mm256_mul_ps(fscal,dz00);
617 /* Update vectorial force */
618 fix0 = _mm256_add_ps(fix0,tx);
619 fiy0 = _mm256_add_ps(fiy0,ty);
620 fiz0 = _mm256_add_ps(fiz0,tz);
622 fjx0 = _mm256_add_ps(fjx0,tx);
623 fjy0 = _mm256_add_ps(fjy0,ty);
624 fjz0 = _mm256_add_ps(fjz0,tz);
626 /**************************
627 * CALCULATE INTERACTIONS *
628 **************************/
630 r10 = _mm256_mul_ps(rsq10,rinv10);
631 r10 = _mm256_andnot_ps(dummy_mask,r10);
633 /* Compute parameters for interactions between i and j atoms */
634 qq10 = _mm256_mul_ps(iq1,jq0);
636 /* Calculate table index by multiplying r with table scale and truncate to integer */
637 rt = _mm256_mul_ps(r10,vftabscale);
638 vfitab = _mm256_cvttps_epi32(rt);
639 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
640 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
641 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
642 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
643 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
644 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
646 /* CUBIC SPLINE TABLE ELECTROSTATICS */
647 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
648 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
649 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
650 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
651 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
652 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
653 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
654 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
655 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
656 Heps = _mm256_mul_ps(vfeps,H);
657 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
658 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
659 velec = _mm256_mul_ps(qq10,VV);
660 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
661 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
663 /* Update potential sum for this i atom from the interaction with this j atom. */
664 velec = _mm256_andnot_ps(dummy_mask,velec);
665 velecsum = _mm256_add_ps(velecsum,velec);
669 fscal = _mm256_andnot_ps(dummy_mask,fscal);
671 /* Calculate temporary vectorial force */
672 tx = _mm256_mul_ps(fscal,dx10);
673 ty = _mm256_mul_ps(fscal,dy10);
674 tz = _mm256_mul_ps(fscal,dz10);
676 /* Update vectorial force */
677 fix1 = _mm256_add_ps(fix1,tx);
678 fiy1 = _mm256_add_ps(fiy1,ty);
679 fiz1 = _mm256_add_ps(fiz1,tz);
681 fjx0 = _mm256_add_ps(fjx0,tx);
682 fjy0 = _mm256_add_ps(fjy0,ty);
683 fjz0 = _mm256_add_ps(fjz0,tz);
685 /**************************
686 * CALCULATE INTERACTIONS *
687 **************************/
689 r20 = _mm256_mul_ps(rsq20,rinv20);
690 r20 = _mm256_andnot_ps(dummy_mask,r20);
692 /* Compute parameters for interactions between i and j atoms */
693 qq20 = _mm256_mul_ps(iq2,jq0);
695 /* Calculate table index by multiplying r with table scale and truncate to integer */
696 rt = _mm256_mul_ps(r20,vftabscale);
697 vfitab = _mm256_cvttps_epi32(rt);
698 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
699 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
700 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
701 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
702 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
703 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
705 /* CUBIC SPLINE TABLE ELECTROSTATICS */
706 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
707 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
708 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
709 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
710 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
711 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
712 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
713 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
714 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
715 Heps = _mm256_mul_ps(vfeps,H);
716 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
717 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
718 velec = _mm256_mul_ps(qq20,VV);
719 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
720 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
722 /* Update potential sum for this i atom from the interaction with this j atom. */
723 velec = _mm256_andnot_ps(dummy_mask,velec);
724 velecsum = _mm256_add_ps(velecsum,velec);
728 fscal = _mm256_andnot_ps(dummy_mask,fscal);
730 /* Calculate temporary vectorial force */
731 tx = _mm256_mul_ps(fscal,dx20);
732 ty = _mm256_mul_ps(fscal,dy20);
733 tz = _mm256_mul_ps(fscal,dz20);
735 /* Update vectorial force */
736 fix2 = _mm256_add_ps(fix2,tx);
737 fiy2 = _mm256_add_ps(fiy2,ty);
738 fiz2 = _mm256_add_ps(fiz2,tz);
740 fjx0 = _mm256_add_ps(fjx0,tx);
741 fjy0 = _mm256_add_ps(fjy0,ty);
742 fjz0 = _mm256_add_ps(fjz0,tz);
744 /**************************
745 * CALCULATE INTERACTIONS *
746 **************************/
748 r30 = _mm256_mul_ps(rsq30,rinv30);
749 r30 = _mm256_andnot_ps(dummy_mask,r30);
751 /* Compute parameters for interactions between i and j atoms */
752 qq30 = _mm256_mul_ps(iq3,jq0);
754 /* Calculate table index by multiplying r with table scale and truncate to integer */
755 rt = _mm256_mul_ps(r30,vftabscale);
756 vfitab = _mm256_cvttps_epi32(rt);
757 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
758 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
759 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
760 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
761 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
762 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
764 /* CUBIC SPLINE TABLE ELECTROSTATICS */
765 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
766 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
767 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
768 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
769 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
770 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
771 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
772 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
773 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
774 Heps = _mm256_mul_ps(vfeps,H);
775 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
776 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
777 velec = _mm256_mul_ps(qq30,VV);
778 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
779 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
781 /* Update potential sum for this i atom from the interaction with this j atom. */
782 velec = _mm256_andnot_ps(dummy_mask,velec);
783 velecsum = _mm256_add_ps(velecsum,velec);
787 fscal = _mm256_andnot_ps(dummy_mask,fscal);
789 /* Calculate temporary vectorial force */
790 tx = _mm256_mul_ps(fscal,dx30);
791 ty = _mm256_mul_ps(fscal,dy30);
792 tz = _mm256_mul_ps(fscal,dz30);
794 /* Update vectorial force */
795 fix3 = _mm256_add_ps(fix3,tx);
796 fiy3 = _mm256_add_ps(fiy3,ty);
797 fiz3 = _mm256_add_ps(fiz3,tz);
799 fjx0 = _mm256_add_ps(fjx0,tx);
800 fjy0 = _mm256_add_ps(fjy0,ty);
801 fjz0 = _mm256_add_ps(fjz0,tz);
803 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
804 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
805 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
806 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
807 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
808 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
809 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
810 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
812 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
814 /* Inner loop uses 167 flops */
817 /* End of innermost loop */
819 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
820 f+i_coord_offset,fshift+i_shift_offset);
823 /* Update potential energies */
824 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
825 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
827 /* Increment number of inner iterations */
828 inneriter += j_index_end - j_index_start;
830 /* Outer loop uses 26 flops */
833 /* Increment number of outer iterations */
836 /* Update outer/inner flops */
838 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*167);
841 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_single
842 * Electrostatics interaction: CubicSplineTable
843 * VdW interaction: LennardJones
844 * Geometry: Water4-Particle
845 * Calculate force/pot: Force
848 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_single
849 (t_nblist * gmx_restrict nlist,
850 rvec * gmx_restrict xx,
851 rvec * gmx_restrict ff,
852 t_forcerec * gmx_restrict fr,
853 t_mdatoms * gmx_restrict mdatoms,
854 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
855 t_nrnb * gmx_restrict nrnb)
857 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
858 * just 0 for non-waters.
859 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
860 * jnr indices corresponding to data put in the four positions in the SIMD register.
862 int i_shift_offset,i_coord_offset,outeriter,inneriter;
863 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
864 int jnrA,jnrB,jnrC,jnrD;
865 int jnrE,jnrF,jnrG,jnrH;
866 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
867 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
868 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
869 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
870 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
872 real *shiftvec,*fshift,*x,*f;
873 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
875 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
876 real * vdwioffsetptr0;
877 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
878 real * vdwioffsetptr1;
879 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
880 real * vdwioffsetptr2;
881 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
882 real * vdwioffsetptr3;
883 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
884 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
885 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
886 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
887 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
888 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
889 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
890 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
893 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
896 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
897 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
899 __m128i vfitab_lo,vfitab_hi;
900 __m128i ifour = _mm_set1_epi32(4);
901 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
903 __m256 dummy_mask,cutoff_mask;
904 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
905 __m256 one = _mm256_set1_ps(1.0);
906 __m256 two = _mm256_set1_ps(2.0);
912 jindex = nlist->jindex;
914 shiftidx = nlist->shift;
916 shiftvec = fr->shift_vec[0];
917 fshift = fr->fshift[0];
918 facel = _mm256_set1_ps(fr->epsfac);
919 charge = mdatoms->chargeA;
920 nvdwtype = fr->ntype;
922 vdwtype = mdatoms->typeA;
924 vftab = kernel_data->table_elec->data;
925 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
927 /* Setup water-specific parameters */
928 inr = nlist->iinr[0];
929 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
930 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
931 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
932 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
934 /* Avoid stupid compiler warnings */
935 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
948 for(iidx=0;iidx<4*DIM;iidx++)
953 /* Start outer loop over neighborlists */
954 for(iidx=0; iidx<nri; iidx++)
956 /* Load shift vector for this list */
957 i_shift_offset = DIM*shiftidx[iidx];
959 /* Load limits for loop over neighbors */
960 j_index_start = jindex[iidx];
961 j_index_end = jindex[iidx+1];
963 /* Get outer coordinate index */
965 i_coord_offset = DIM*inr;
967 /* Load i particle coords and add shift vector */
968 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
969 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
971 fix0 = _mm256_setzero_ps();
972 fiy0 = _mm256_setzero_ps();
973 fiz0 = _mm256_setzero_ps();
974 fix1 = _mm256_setzero_ps();
975 fiy1 = _mm256_setzero_ps();
976 fiz1 = _mm256_setzero_ps();
977 fix2 = _mm256_setzero_ps();
978 fiy2 = _mm256_setzero_ps();
979 fiz2 = _mm256_setzero_ps();
980 fix3 = _mm256_setzero_ps();
981 fiy3 = _mm256_setzero_ps();
982 fiz3 = _mm256_setzero_ps();
984 /* Start inner kernel loop */
985 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
988 /* Get j neighbor index, and coordinate index */
997 j_coord_offsetA = DIM*jnrA;
998 j_coord_offsetB = DIM*jnrB;
999 j_coord_offsetC = DIM*jnrC;
1000 j_coord_offsetD = DIM*jnrD;
1001 j_coord_offsetE = DIM*jnrE;
1002 j_coord_offsetF = DIM*jnrF;
1003 j_coord_offsetG = DIM*jnrG;
1004 j_coord_offsetH = DIM*jnrH;
1006 /* load j atom coordinates */
1007 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1008 x+j_coord_offsetC,x+j_coord_offsetD,
1009 x+j_coord_offsetE,x+j_coord_offsetF,
1010 x+j_coord_offsetG,x+j_coord_offsetH,
1013 /* Calculate displacement vector */
1014 dx00 = _mm256_sub_ps(ix0,jx0);
1015 dy00 = _mm256_sub_ps(iy0,jy0);
1016 dz00 = _mm256_sub_ps(iz0,jz0);
1017 dx10 = _mm256_sub_ps(ix1,jx0);
1018 dy10 = _mm256_sub_ps(iy1,jy0);
1019 dz10 = _mm256_sub_ps(iz1,jz0);
1020 dx20 = _mm256_sub_ps(ix2,jx0);
1021 dy20 = _mm256_sub_ps(iy2,jy0);
1022 dz20 = _mm256_sub_ps(iz2,jz0);
1023 dx30 = _mm256_sub_ps(ix3,jx0);
1024 dy30 = _mm256_sub_ps(iy3,jy0);
1025 dz30 = _mm256_sub_ps(iz3,jz0);
1027 /* Calculate squared distance and things based on it */
1028 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1029 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1030 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1031 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1033 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1034 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1035 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
1037 rinvsq00 = gmx_mm256_inv_ps(rsq00);
1039 /* Load parameters for j particles */
1040 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1041 charge+jnrC+0,charge+jnrD+0,
1042 charge+jnrE+0,charge+jnrF+0,
1043 charge+jnrG+0,charge+jnrH+0);
1044 vdwjidx0A = 2*vdwtype[jnrA+0];
1045 vdwjidx0B = 2*vdwtype[jnrB+0];
1046 vdwjidx0C = 2*vdwtype[jnrC+0];
1047 vdwjidx0D = 2*vdwtype[jnrD+0];
1048 vdwjidx0E = 2*vdwtype[jnrE+0];
1049 vdwjidx0F = 2*vdwtype[jnrF+0];
1050 vdwjidx0G = 2*vdwtype[jnrG+0];
1051 vdwjidx0H = 2*vdwtype[jnrH+0];
1053 fjx0 = _mm256_setzero_ps();
1054 fjy0 = _mm256_setzero_ps();
1055 fjz0 = _mm256_setzero_ps();
1057 /**************************
1058 * CALCULATE INTERACTIONS *
1059 **************************/
1061 /* Compute parameters for interactions between i and j atoms */
1062 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1063 vdwioffsetptr0+vdwjidx0B,
1064 vdwioffsetptr0+vdwjidx0C,
1065 vdwioffsetptr0+vdwjidx0D,
1066 vdwioffsetptr0+vdwjidx0E,
1067 vdwioffsetptr0+vdwjidx0F,
1068 vdwioffsetptr0+vdwjidx0G,
1069 vdwioffsetptr0+vdwjidx0H,
1072 /* LENNARD-JONES DISPERSION/REPULSION */
1074 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1075 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1079 /* Calculate temporary vectorial force */
1080 tx = _mm256_mul_ps(fscal,dx00);
1081 ty = _mm256_mul_ps(fscal,dy00);
1082 tz = _mm256_mul_ps(fscal,dz00);
1084 /* Update vectorial force */
1085 fix0 = _mm256_add_ps(fix0,tx);
1086 fiy0 = _mm256_add_ps(fiy0,ty);
1087 fiz0 = _mm256_add_ps(fiz0,tz);
1089 fjx0 = _mm256_add_ps(fjx0,tx);
1090 fjy0 = _mm256_add_ps(fjy0,ty);
1091 fjz0 = _mm256_add_ps(fjz0,tz);
1093 /**************************
1094 * CALCULATE INTERACTIONS *
1095 **************************/
1097 r10 = _mm256_mul_ps(rsq10,rinv10);
1099 /* Compute parameters for interactions between i and j atoms */
1100 qq10 = _mm256_mul_ps(iq1,jq0);
1102 /* Calculate table index by multiplying r with table scale and truncate to integer */
1103 rt = _mm256_mul_ps(r10,vftabscale);
1104 vfitab = _mm256_cvttps_epi32(rt);
1105 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1106 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1107 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1108 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1109 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1110 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1112 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1113 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1114 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1115 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1116 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1117 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1118 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1119 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1120 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1121 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1122 Heps = _mm256_mul_ps(vfeps,H);
1123 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1124 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1125 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1129 /* Calculate temporary vectorial force */
1130 tx = _mm256_mul_ps(fscal,dx10);
1131 ty = _mm256_mul_ps(fscal,dy10);
1132 tz = _mm256_mul_ps(fscal,dz10);
1134 /* Update vectorial force */
1135 fix1 = _mm256_add_ps(fix1,tx);
1136 fiy1 = _mm256_add_ps(fiy1,ty);
1137 fiz1 = _mm256_add_ps(fiz1,tz);
1139 fjx0 = _mm256_add_ps(fjx0,tx);
1140 fjy0 = _mm256_add_ps(fjy0,ty);
1141 fjz0 = _mm256_add_ps(fjz0,tz);
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 r20 = _mm256_mul_ps(rsq20,rinv20);
1149 /* Compute parameters for interactions between i and j atoms */
1150 qq20 = _mm256_mul_ps(iq2,jq0);
1152 /* Calculate table index by multiplying r with table scale and truncate to integer */
1153 rt = _mm256_mul_ps(r20,vftabscale);
1154 vfitab = _mm256_cvttps_epi32(rt);
1155 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1156 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1157 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1158 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1159 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1160 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1162 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1163 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1164 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1165 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1166 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1167 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1168 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1169 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1170 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1171 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1172 Heps = _mm256_mul_ps(vfeps,H);
1173 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1174 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1175 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1179 /* Calculate temporary vectorial force */
1180 tx = _mm256_mul_ps(fscal,dx20);
1181 ty = _mm256_mul_ps(fscal,dy20);
1182 tz = _mm256_mul_ps(fscal,dz20);
1184 /* Update vectorial force */
1185 fix2 = _mm256_add_ps(fix2,tx);
1186 fiy2 = _mm256_add_ps(fiy2,ty);
1187 fiz2 = _mm256_add_ps(fiz2,tz);
1189 fjx0 = _mm256_add_ps(fjx0,tx);
1190 fjy0 = _mm256_add_ps(fjy0,ty);
1191 fjz0 = _mm256_add_ps(fjz0,tz);
1193 /**************************
1194 * CALCULATE INTERACTIONS *
1195 **************************/
1197 r30 = _mm256_mul_ps(rsq30,rinv30);
1199 /* Compute parameters for interactions between i and j atoms */
1200 qq30 = _mm256_mul_ps(iq3,jq0);
1202 /* Calculate table index by multiplying r with table scale and truncate to integer */
1203 rt = _mm256_mul_ps(r30,vftabscale);
1204 vfitab = _mm256_cvttps_epi32(rt);
1205 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1206 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1207 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1208 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1209 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1210 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1212 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1213 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1214 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1215 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1216 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1217 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1218 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1219 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1220 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1221 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1222 Heps = _mm256_mul_ps(vfeps,H);
1223 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1224 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1225 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
1229 /* Calculate temporary vectorial force */
1230 tx = _mm256_mul_ps(fscal,dx30);
1231 ty = _mm256_mul_ps(fscal,dy30);
1232 tz = _mm256_mul_ps(fscal,dz30);
1234 /* Update vectorial force */
1235 fix3 = _mm256_add_ps(fix3,tx);
1236 fiy3 = _mm256_add_ps(fiy3,ty);
1237 fiz3 = _mm256_add_ps(fiz3,tz);
1239 fjx0 = _mm256_add_ps(fjx0,tx);
1240 fjy0 = _mm256_add_ps(fjy0,ty);
1241 fjz0 = _mm256_add_ps(fjz0,tz);
1243 fjptrA = f+j_coord_offsetA;
1244 fjptrB = f+j_coord_offsetB;
1245 fjptrC = f+j_coord_offsetC;
1246 fjptrD = f+j_coord_offsetD;
1247 fjptrE = f+j_coord_offsetE;
1248 fjptrF = f+j_coord_offsetF;
1249 fjptrG = f+j_coord_offsetG;
1250 fjptrH = f+j_coord_offsetH;
1252 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1254 /* Inner loop uses 147 flops */
1257 if(jidx<j_index_end)
1260 /* Get j neighbor index, and coordinate index */
1261 jnrlistA = jjnr[jidx];
1262 jnrlistB = jjnr[jidx+1];
1263 jnrlistC = jjnr[jidx+2];
1264 jnrlistD = jjnr[jidx+3];
1265 jnrlistE = jjnr[jidx+4];
1266 jnrlistF = jjnr[jidx+5];
1267 jnrlistG = jjnr[jidx+6];
1268 jnrlistH = jjnr[jidx+7];
1269 /* Sign of each element will be negative for non-real atoms.
1270 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1271 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1273 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1274 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1276 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1277 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1278 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1279 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1280 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1281 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1282 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1283 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1284 j_coord_offsetA = DIM*jnrA;
1285 j_coord_offsetB = DIM*jnrB;
1286 j_coord_offsetC = DIM*jnrC;
1287 j_coord_offsetD = DIM*jnrD;
1288 j_coord_offsetE = DIM*jnrE;
1289 j_coord_offsetF = DIM*jnrF;
1290 j_coord_offsetG = DIM*jnrG;
1291 j_coord_offsetH = DIM*jnrH;
1293 /* load j atom coordinates */
1294 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1295 x+j_coord_offsetC,x+j_coord_offsetD,
1296 x+j_coord_offsetE,x+j_coord_offsetF,
1297 x+j_coord_offsetG,x+j_coord_offsetH,
1300 /* Calculate displacement vector */
1301 dx00 = _mm256_sub_ps(ix0,jx0);
1302 dy00 = _mm256_sub_ps(iy0,jy0);
1303 dz00 = _mm256_sub_ps(iz0,jz0);
1304 dx10 = _mm256_sub_ps(ix1,jx0);
1305 dy10 = _mm256_sub_ps(iy1,jy0);
1306 dz10 = _mm256_sub_ps(iz1,jz0);
1307 dx20 = _mm256_sub_ps(ix2,jx0);
1308 dy20 = _mm256_sub_ps(iy2,jy0);
1309 dz20 = _mm256_sub_ps(iz2,jz0);
1310 dx30 = _mm256_sub_ps(ix3,jx0);
1311 dy30 = _mm256_sub_ps(iy3,jy0);
1312 dz30 = _mm256_sub_ps(iz3,jz0);
1314 /* Calculate squared distance and things based on it */
1315 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1316 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1317 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1318 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1320 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1321 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1322 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
1324 rinvsq00 = gmx_mm256_inv_ps(rsq00);
1326 /* Load parameters for j particles */
1327 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1328 charge+jnrC+0,charge+jnrD+0,
1329 charge+jnrE+0,charge+jnrF+0,
1330 charge+jnrG+0,charge+jnrH+0);
1331 vdwjidx0A = 2*vdwtype[jnrA+0];
1332 vdwjidx0B = 2*vdwtype[jnrB+0];
1333 vdwjidx0C = 2*vdwtype[jnrC+0];
1334 vdwjidx0D = 2*vdwtype[jnrD+0];
1335 vdwjidx0E = 2*vdwtype[jnrE+0];
1336 vdwjidx0F = 2*vdwtype[jnrF+0];
1337 vdwjidx0G = 2*vdwtype[jnrG+0];
1338 vdwjidx0H = 2*vdwtype[jnrH+0];
1340 fjx0 = _mm256_setzero_ps();
1341 fjy0 = _mm256_setzero_ps();
1342 fjz0 = _mm256_setzero_ps();
1344 /**************************
1345 * CALCULATE INTERACTIONS *
1346 **************************/
1348 /* Compute parameters for interactions between i and j atoms */
1349 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1350 vdwioffsetptr0+vdwjidx0B,
1351 vdwioffsetptr0+vdwjidx0C,
1352 vdwioffsetptr0+vdwjidx0D,
1353 vdwioffsetptr0+vdwjidx0E,
1354 vdwioffsetptr0+vdwjidx0F,
1355 vdwioffsetptr0+vdwjidx0G,
1356 vdwioffsetptr0+vdwjidx0H,
1359 /* LENNARD-JONES DISPERSION/REPULSION */
1361 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1362 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1366 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1368 /* Calculate temporary vectorial force */
1369 tx = _mm256_mul_ps(fscal,dx00);
1370 ty = _mm256_mul_ps(fscal,dy00);
1371 tz = _mm256_mul_ps(fscal,dz00);
1373 /* Update vectorial force */
1374 fix0 = _mm256_add_ps(fix0,tx);
1375 fiy0 = _mm256_add_ps(fiy0,ty);
1376 fiz0 = _mm256_add_ps(fiz0,tz);
1378 fjx0 = _mm256_add_ps(fjx0,tx);
1379 fjy0 = _mm256_add_ps(fjy0,ty);
1380 fjz0 = _mm256_add_ps(fjz0,tz);
1382 /**************************
1383 * CALCULATE INTERACTIONS *
1384 **************************/
1386 r10 = _mm256_mul_ps(rsq10,rinv10);
1387 r10 = _mm256_andnot_ps(dummy_mask,r10);
1389 /* Compute parameters for interactions between i and j atoms */
1390 qq10 = _mm256_mul_ps(iq1,jq0);
1392 /* Calculate table index by multiplying r with table scale and truncate to integer */
1393 rt = _mm256_mul_ps(r10,vftabscale);
1394 vfitab = _mm256_cvttps_epi32(rt);
1395 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1396 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1397 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1398 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1399 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1400 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1402 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1403 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1404 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1405 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1406 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1407 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1408 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1409 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1410 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1411 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1412 Heps = _mm256_mul_ps(vfeps,H);
1413 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1414 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1415 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1419 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1421 /* Calculate temporary vectorial force */
1422 tx = _mm256_mul_ps(fscal,dx10);
1423 ty = _mm256_mul_ps(fscal,dy10);
1424 tz = _mm256_mul_ps(fscal,dz10);
1426 /* Update vectorial force */
1427 fix1 = _mm256_add_ps(fix1,tx);
1428 fiy1 = _mm256_add_ps(fiy1,ty);
1429 fiz1 = _mm256_add_ps(fiz1,tz);
1431 fjx0 = _mm256_add_ps(fjx0,tx);
1432 fjy0 = _mm256_add_ps(fjy0,ty);
1433 fjz0 = _mm256_add_ps(fjz0,tz);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 r20 = _mm256_mul_ps(rsq20,rinv20);
1440 r20 = _mm256_andnot_ps(dummy_mask,r20);
1442 /* Compute parameters for interactions between i and j atoms */
1443 qq20 = _mm256_mul_ps(iq2,jq0);
1445 /* Calculate table index by multiplying r with table scale and truncate to integer */
1446 rt = _mm256_mul_ps(r20,vftabscale);
1447 vfitab = _mm256_cvttps_epi32(rt);
1448 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1449 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1450 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1451 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1452 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1453 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1455 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1456 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1457 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1458 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1459 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1460 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1461 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1462 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1463 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1464 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1465 Heps = _mm256_mul_ps(vfeps,H);
1466 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1467 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1468 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1472 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1474 /* Calculate temporary vectorial force */
1475 tx = _mm256_mul_ps(fscal,dx20);
1476 ty = _mm256_mul_ps(fscal,dy20);
1477 tz = _mm256_mul_ps(fscal,dz20);
1479 /* Update vectorial force */
1480 fix2 = _mm256_add_ps(fix2,tx);
1481 fiy2 = _mm256_add_ps(fiy2,ty);
1482 fiz2 = _mm256_add_ps(fiz2,tz);
1484 fjx0 = _mm256_add_ps(fjx0,tx);
1485 fjy0 = _mm256_add_ps(fjy0,ty);
1486 fjz0 = _mm256_add_ps(fjz0,tz);
1488 /**************************
1489 * CALCULATE INTERACTIONS *
1490 **************************/
1492 r30 = _mm256_mul_ps(rsq30,rinv30);
1493 r30 = _mm256_andnot_ps(dummy_mask,r30);
1495 /* Compute parameters for interactions between i and j atoms */
1496 qq30 = _mm256_mul_ps(iq3,jq0);
1498 /* Calculate table index by multiplying r with table scale and truncate to integer */
1499 rt = _mm256_mul_ps(r30,vftabscale);
1500 vfitab = _mm256_cvttps_epi32(rt);
1501 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1502 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1503 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1504 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1505 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1506 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1508 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1509 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1510 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1511 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1512 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1513 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1514 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1515 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1516 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1517 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1518 Heps = _mm256_mul_ps(vfeps,H);
1519 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1520 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1521 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
1525 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1527 /* Calculate temporary vectorial force */
1528 tx = _mm256_mul_ps(fscal,dx30);
1529 ty = _mm256_mul_ps(fscal,dy30);
1530 tz = _mm256_mul_ps(fscal,dz30);
1532 /* Update vectorial force */
1533 fix3 = _mm256_add_ps(fix3,tx);
1534 fiy3 = _mm256_add_ps(fiy3,ty);
1535 fiz3 = _mm256_add_ps(fiz3,tz);
1537 fjx0 = _mm256_add_ps(fjx0,tx);
1538 fjy0 = _mm256_add_ps(fjy0,ty);
1539 fjz0 = _mm256_add_ps(fjz0,tz);
1541 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1542 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1543 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1544 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1545 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1546 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1547 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1548 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1550 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1552 /* Inner loop uses 150 flops */
1555 /* End of innermost loop */
1557 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1558 f+i_coord_offset,fshift+i_shift_offset);
1560 /* Increment number of inner iterations */
1561 inneriter += j_index_end - j_index_start;
1563 /* Outer loop uses 24 flops */
1566 /* Increment number of outer iterations */
1569 /* Update outer/inner flops */
1571 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*150);