2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 real * vdwioffsetptr3;
79 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
80 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
81 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
82 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
84 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
85 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
86 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
89 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
92 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
93 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
95 __m128i vfitab_lo,vfitab_hi;
96 __m128i ifour = _mm_set1_epi32(4);
97 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
99 __m256 dummy_mask,cutoff_mask;
100 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
101 __m256 one = _mm256_set1_ps(1.0);
102 __m256 two = _mm256_set1_ps(2.0);
108 jindex = nlist->jindex;
110 shiftidx = nlist->shift;
112 shiftvec = fr->shift_vec[0];
113 fshift = fr->fshift[0];
114 facel = _mm256_set1_ps(fr->epsfac);
115 charge = mdatoms->chargeA;
116 nvdwtype = fr->ntype;
118 vdwtype = mdatoms->typeA;
120 vftab = kernel_data->table_elec->data;
121 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
123 /* Setup water-specific parameters */
124 inr = nlist->iinr[0];
125 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
126 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
127 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
128 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
130 /* Avoid stupid compiler warnings */
131 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
144 for(iidx=0;iidx<4*DIM;iidx++)
149 /* Start outer loop over neighborlists */
150 for(iidx=0; iidx<nri; iidx++)
152 /* Load shift vector for this list */
153 i_shift_offset = DIM*shiftidx[iidx];
155 /* Load limits for loop over neighbors */
156 j_index_start = jindex[iidx];
157 j_index_end = jindex[iidx+1];
159 /* Get outer coordinate index */
161 i_coord_offset = DIM*inr;
163 /* Load i particle coords and add shift vector */
164 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
165 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
167 fix0 = _mm256_setzero_ps();
168 fiy0 = _mm256_setzero_ps();
169 fiz0 = _mm256_setzero_ps();
170 fix1 = _mm256_setzero_ps();
171 fiy1 = _mm256_setzero_ps();
172 fiz1 = _mm256_setzero_ps();
173 fix2 = _mm256_setzero_ps();
174 fiy2 = _mm256_setzero_ps();
175 fiz2 = _mm256_setzero_ps();
176 fix3 = _mm256_setzero_ps();
177 fiy3 = _mm256_setzero_ps();
178 fiz3 = _mm256_setzero_ps();
180 /* Reset potential sums */
181 velecsum = _mm256_setzero_ps();
182 vvdwsum = _mm256_setzero_ps();
184 /* Start inner kernel loop */
185 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
188 /* Get j neighbor index, and coordinate index */
197 j_coord_offsetA = DIM*jnrA;
198 j_coord_offsetB = DIM*jnrB;
199 j_coord_offsetC = DIM*jnrC;
200 j_coord_offsetD = DIM*jnrD;
201 j_coord_offsetE = DIM*jnrE;
202 j_coord_offsetF = DIM*jnrF;
203 j_coord_offsetG = DIM*jnrG;
204 j_coord_offsetH = DIM*jnrH;
206 /* load j atom coordinates */
207 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
208 x+j_coord_offsetC,x+j_coord_offsetD,
209 x+j_coord_offsetE,x+j_coord_offsetF,
210 x+j_coord_offsetG,x+j_coord_offsetH,
213 /* Calculate displacement vector */
214 dx00 = _mm256_sub_ps(ix0,jx0);
215 dy00 = _mm256_sub_ps(iy0,jy0);
216 dz00 = _mm256_sub_ps(iz0,jz0);
217 dx10 = _mm256_sub_ps(ix1,jx0);
218 dy10 = _mm256_sub_ps(iy1,jy0);
219 dz10 = _mm256_sub_ps(iz1,jz0);
220 dx20 = _mm256_sub_ps(ix2,jx0);
221 dy20 = _mm256_sub_ps(iy2,jy0);
222 dz20 = _mm256_sub_ps(iz2,jz0);
223 dx30 = _mm256_sub_ps(ix3,jx0);
224 dy30 = _mm256_sub_ps(iy3,jy0);
225 dz30 = _mm256_sub_ps(iz3,jz0);
227 /* Calculate squared distance and things based on it */
228 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
229 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
230 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
231 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
233 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
234 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
235 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
237 rinvsq00 = gmx_mm256_inv_ps(rsq00);
239 /* Load parameters for j particles */
240 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
241 charge+jnrC+0,charge+jnrD+0,
242 charge+jnrE+0,charge+jnrF+0,
243 charge+jnrG+0,charge+jnrH+0);
244 vdwjidx0A = 2*vdwtype[jnrA+0];
245 vdwjidx0B = 2*vdwtype[jnrB+0];
246 vdwjidx0C = 2*vdwtype[jnrC+0];
247 vdwjidx0D = 2*vdwtype[jnrD+0];
248 vdwjidx0E = 2*vdwtype[jnrE+0];
249 vdwjidx0F = 2*vdwtype[jnrF+0];
250 vdwjidx0G = 2*vdwtype[jnrG+0];
251 vdwjidx0H = 2*vdwtype[jnrH+0];
253 fjx0 = _mm256_setzero_ps();
254 fjy0 = _mm256_setzero_ps();
255 fjz0 = _mm256_setzero_ps();
257 /**************************
258 * CALCULATE INTERACTIONS *
259 **************************/
261 /* Compute parameters for interactions between i and j atoms */
262 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
263 vdwioffsetptr0+vdwjidx0B,
264 vdwioffsetptr0+vdwjidx0C,
265 vdwioffsetptr0+vdwjidx0D,
266 vdwioffsetptr0+vdwjidx0E,
267 vdwioffsetptr0+vdwjidx0F,
268 vdwioffsetptr0+vdwjidx0G,
269 vdwioffsetptr0+vdwjidx0H,
272 /* LENNARD-JONES DISPERSION/REPULSION */
274 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
275 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
276 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
277 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
278 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
280 /* Update potential sum for this i atom from the interaction with this j atom. */
281 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
285 /* Calculate temporary vectorial force */
286 tx = _mm256_mul_ps(fscal,dx00);
287 ty = _mm256_mul_ps(fscal,dy00);
288 tz = _mm256_mul_ps(fscal,dz00);
290 /* Update vectorial force */
291 fix0 = _mm256_add_ps(fix0,tx);
292 fiy0 = _mm256_add_ps(fiy0,ty);
293 fiz0 = _mm256_add_ps(fiz0,tz);
295 fjx0 = _mm256_add_ps(fjx0,tx);
296 fjy0 = _mm256_add_ps(fjy0,ty);
297 fjz0 = _mm256_add_ps(fjz0,tz);
299 /**************************
300 * CALCULATE INTERACTIONS *
301 **************************/
303 r10 = _mm256_mul_ps(rsq10,rinv10);
305 /* Compute parameters for interactions between i and j atoms */
306 qq10 = _mm256_mul_ps(iq1,jq0);
308 /* Calculate table index by multiplying r with table scale and truncate to integer */
309 rt = _mm256_mul_ps(r10,vftabscale);
310 vfitab = _mm256_cvttps_epi32(rt);
311 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
312 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
313 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
314 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
315 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
316 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
318 /* CUBIC SPLINE TABLE ELECTROSTATICS */
319 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
320 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
321 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
322 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
323 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
324 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
325 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
326 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
327 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
328 Heps = _mm256_mul_ps(vfeps,H);
329 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
330 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
331 velec = _mm256_mul_ps(qq10,VV);
332 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
333 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
335 /* Update potential sum for this i atom from the interaction with this j atom. */
336 velecsum = _mm256_add_ps(velecsum,velec);
340 /* Calculate temporary vectorial force */
341 tx = _mm256_mul_ps(fscal,dx10);
342 ty = _mm256_mul_ps(fscal,dy10);
343 tz = _mm256_mul_ps(fscal,dz10);
345 /* Update vectorial force */
346 fix1 = _mm256_add_ps(fix1,tx);
347 fiy1 = _mm256_add_ps(fiy1,ty);
348 fiz1 = _mm256_add_ps(fiz1,tz);
350 fjx0 = _mm256_add_ps(fjx0,tx);
351 fjy0 = _mm256_add_ps(fjy0,ty);
352 fjz0 = _mm256_add_ps(fjz0,tz);
354 /**************************
355 * CALCULATE INTERACTIONS *
356 **************************/
358 r20 = _mm256_mul_ps(rsq20,rinv20);
360 /* Compute parameters for interactions between i and j atoms */
361 qq20 = _mm256_mul_ps(iq2,jq0);
363 /* Calculate table index by multiplying r with table scale and truncate to integer */
364 rt = _mm256_mul_ps(r20,vftabscale);
365 vfitab = _mm256_cvttps_epi32(rt);
366 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
367 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
368 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
369 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
370 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
371 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
373 /* CUBIC SPLINE TABLE ELECTROSTATICS */
374 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
375 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
376 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
377 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
378 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
379 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
380 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
381 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
382 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
383 Heps = _mm256_mul_ps(vfeps,H);
384 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
385 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
386 velec = _mm256_mul_ps(qq20,VV);
387 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
388 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
390 /* Update potential sum for this i atom from the interaction with this j atom. */
391 velecsum = _mm256_add_ps(velecsum,velec);
395 /* Calculate temporary vectorial force */
396 tx = _mm256_mul_ps(fscal,dx20);
397 ty = _mm256_mul_ps(fscal,dy20);
398 tz = _mm256_mul_ps(fscal,dz20);
400 /* Update vectorial force */
401 fix2 = _mm256_add_ps(fix2,tx);
402 fiy2 = _mm256_add_ps(fiy2,ty);
403 fiz2 = _mm256_add_ps(fiz2,tz);
405 fjx0 = _mm256_add_ps(fjx0,tx);
406 fjy0 = _mm256_add_ps(fjy0,ty);
407 fjz0 = _mm256_add_ps(fjz0,tz);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 r30 = _mm256_mul_ps(rsq30,rinv30);
415 /* Compute parameters for interactions between i and j atoms */
416 qq30 = _mm256_mul_ps(iq3,jq0);
418 /* Calculate table index by multiplying r with table scale and truncate to integer */
419 rt = _mm256_mul_ps(r30,vftabscale);
420 vfitab = _mm256_cvttps_epi32(rt);
421 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
422 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
423 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
424 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
425 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
426 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
428 /* CUBIC SPLINE TABLE ELECTROSTATICS */
429 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
430 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
431 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
432 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
433 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
434 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
435 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
436 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
437 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
438 Heps = _mm256_mul_ps(vfeps,H);
439 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
440 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
441 velec = _mm256_mul_ps(qq30,VV);
442 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
443 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
445 /* Update potential sum for this i atom from the interaction with this j atom. */
446 velecsum = _mm256_add_ps(velecsum,velec);
450 /* Calculate temporary vectorial force */
451 tx = _mm256_mul_ps(fscal,dx30);
452 ty = _mm256_mul_ps(fscal,dy30);
453 tz = _mm256_mul_ps(fscal,dz30);
455 /* Update vectorial force */
456 fix3 = _mm256_add_ps(fix3,tx);
457 fiy3 = _mm256_add_ps(fiy3,ty);
458 fiz3 = _mm256_add_ps(fiz3,tz);
460 fjx0 = _mm256_add_ps(fjx0,tx);
461 fjy0 = _mm256_add_ps(fjy0,ty);
462 fjz0 = _mm256_add_ps(fjz0,tz);
464 fjptrA = f+j_coord_offsetA;
465 fjptrB = f+j_coord_offsetB;
466 fjptrC = f+j_coord_offsetC;
467 fjptrD = f+j_coord_offsetD;
468 fjptrE = f+j_coord_offsetE;
469 fjptrF = f+j_coord_offsetF;
470 fjptrG = f+j_coord_offsetG;
471 fjptrH = f+j_coord_offsetH;
473 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
475 /* Inner loop uses 164 flops */
481 /* Get j neighbor index, and coordinate index */
482 jnrlistA = jjnr[jidx];
483 jnrlistB = jjnr[jidx+1];
484 jnrlistC = jjnr[jidx+2];
485 jnrlistD = jjnr[jidx+3];
486 jnrlistE = jjnr[jidx+4];
487 jnrlistF = jjnr[jidx+5];
488 jnrlistG = jjnr[jidx+6];
489 jnrlistH = jjnr[jidx+7];
490 /* Sign of each element will be negative for non-real atoms.
491 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
492 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
494 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
495 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
497 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
498 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
499 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
500 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
501 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
502 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
503 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
504 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
505 j_coord_offsetA = DIM*jnrA;
506 j_coord_offsetB = DIM*jnrB;
507 j_coord_offsetC = DIM*jnrC;
508 j_coord_offsetD = DIM*jnrD;
509 j_coord_offsetE = DIM*jnrE;
510 j_coord_offsetF = DIM*jnrF;
511 j_coord_offsetG = DIM*jnrG;
512 j_coord_offsetH = DIM*jnrH;
514 /* load j atom coordinates */
515 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
516 x+j_coord_offsetC,x+j_coord_offsetD,
517 x+j_coord_offsetE,x+j_coord_offsetF,
518 x+j_coord_offsetG,x+j_coord_offsetH,
521 /* Calculate displacement vector */
522 dx00 = _mm256_sub_ps(ix0,jx0);
523 dy00 = _mm256_sub_ps(iy0,jy0);
524 dz00 = _mm256_sub_ps(iz0,jz0);
525 dx10 = _mm256_sub_ps(ix1,jx0);
526 dy10 = _mm256_sub_ps(iy1,jy0);
527 dz10 = _mm256_sub_ps(iz1,jz0);
528 dx20 = _mm256_sub_ps(ix2,jx0);
529 dy20 = _mm256_sub_ps(iy2,jy0);
530 dz20 = _mm256_sub_ps(iz2,jz0);
531 dx30 = _mm256_sub_ps(ix3,jx0);
532 dy30 = _mm256_sub_ps(iy3,jy0);
533 dz30 = _mm256_sub_ps(iz3,jz0);
535 /* Calculate squared distance and things based on it */
536 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
537 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
538 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
539 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
541 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
542 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
543 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
545 rinvsq00 = gmx_mm256_inv_ps(rsq00);
547 /* Load parameters for j particles */
548 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
549 charge+jnrC+0,charge+jnrD+0,
550 charge+jnrE+0,charge+jnrF+0,
551 charge+jnrG+0,charge+jnrH+0);
552 vdwjidx0A = 2*vdwtype[jnrA+0];
553 vdwjidx0B = 2*vdwtype[jnrB+0];
554 vdwjidx0C = 2*vdwtype[jnrC+0];
555 vdwjidx0D = 2*vdwtype[jnrD+0];
556 vdwjidx0E = 2*vdwtype[jnrE+0];
557 vdwjidx0F = 2*vdwtype[jnrF+0];
558 vdwjidx0G = 2*vdwtype[jnrG+0];
559 vdwjidx0H = 2*vdwtype[jnrH+0];
561 fjx0 = _mm256_setzero_ps();
562 fjy0 = _mm256_setzero_ps();
563 fjz0 = _mm256_setzero_ps();
565 /**************************
566 * CALCULATE INTERACTIONS *
567 **************************/
569 /* Compute parameters for interactions between i and j atoms */
570 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
571 vdwioffsetptr0+vdwjidx0B,
572 vdwioffsetptr0+vdwjidx0C,
573 vdwioffsetptr0+vdwjidx0D,
574 vdwioffsetptr0+vdwjidx0E,
575 vdwioffsetptr0+vdwjidx0F,
576 vdwioffsetptr0+vdwjidx0G,
577 vdwioffsetptr0+vdwjidx0H,
580 /* LENNARD-JONES DISPERSION/REPULSION */
582 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
583 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
584 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
585 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
586 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
588 /* Update potential sum for this i atom from the interaction with this j atom. */
589 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
590 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
594 fscal = _mm256_andnot_ps(dummy_mask,fscal);
596 /* Calculate temporary vectorial force */
597 tx = _mm256_mul_ps(fscal,dx00);
598 ty = _mm256_mul_ps(fscal,dy00);
599 tz = _mm256_mul_ps(fscal,dz00);
601 /* Update vectorial force */
602 fix0 = _mm256_add_ps(fix0,tx);
603 fiy0 = _mm256_add_ps(fiy0,ty);
604 fiz0 = _mm256_add_ps(fiz0,tz);
606 fjx0 = _mm256_add_ps(fjx0,tx);
607 fjy0 = _mm256_add_ps(fjy0,ty);
608 fjz0 = _mm256_add_ps(fjz0,tz);
610 /**************************
611 * CALCULATE INTERACTIONS *
612 **************************/
614 r10 = _mm256_mul_ps(rsq10,rinv10);
615 r10 = _mm256_andnot_ps(dummy_mask,r10);
617 /* Compute parameters for interactions between i and j atoms */
618 qq10 = _mm256_mul_ps(iq1,jq0);
620 /* Calculate table index by multiplying r with table scale and truncate to integer */
621 rt = _mm256_mul_ps(r10,vftabscale);
622 vfitab = _mm256_cvttps_epi32(rt);
623 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
624 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
625 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
626 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
627 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
628 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
630 /* CUBIC SPLINE TABLE ELECTROSTATICS */
631 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
632 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
633 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
634 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
635 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
636 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
637 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
638 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
639 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
640 Heps = _mm256_mul_ps(vfeps,H);
641 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
642 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
643 velec = _mm256_mul_ps(qq10,VV);
644 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
645 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
647 /* Update potential sum for this i atom from the interaction with this j atom. */
648 velec = _mm256_andnot_ps(dummy_mask,velec);
649 velecsum = _mm256_add_ps(velecsum,velec);
653 fscal = _mm256_andnot_ps(dummy_mask,fscal);
655 /* Calculate temporary vectorial force */
656 tx = _mm256_mul_ps(fscal,dx10);
657 ty = _mm256_mul_ps(fscal,dy10);
658 tz = _mm256_mul_ps(fscal,dz10);
660 /* Update vectorial force */
661 fix1 = _mm256_add_ps(fix1,tx);
662 fiy1 = _mm256_add_ps(fiy1,ty);
663 fiz1 = _mm256_add_ps(fiz1,tz);
665 fjx0 = _mm256_add_ps(fjx0,tx);
666 fjy0 = _mm256_add_ps(fjy0,ty);
667 fjz0 = _mm256_add_ps(fjz0,tz);
669 /**************************
670 * CALCULATE INTERACTIONS *
671 **************************/
673 r20 = _mm256_mul_ps(rsq20,rinv20);
674 r20 = _mm256_andnot_ps(dummy_mask,r20);
676 /* Compute parameters for interactions between i and j atoms */
677 qq20 = _mm256_mul_ps(iq2,jq0);
679 /* Calculate table index by multiplying r with table scale and truncate to integer */
680 rt = _mm256_mul_ps(r20,vftabscale);
681 vfitab = _mm256_cvttps_epi32(rt);
682 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
683 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
684 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
685 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
686 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
687 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
689 /* CUBIC SPLINE TABLE ELECTROSTATICS */
690 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
691 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
692 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
693 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
694 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
695 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
696 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
697 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
698 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
699 Heps = _mm256_mul_ps(vfeps,H);
700 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
701 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
702 velec = _mm256_mul_ps(qq20,VV);
703 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
704 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
706 /* Update potential sum for this i atom from the interaction with this j atom. */
707 velec = _mm256_andnot_ps(dummy_mask,velec);
708 velecsum = _mm256_add_ps(velecsum,velec);
712 fscal = _mm256_andnot_ps(dummy_mask,fscal);
714 /* Calculate temporary vectorial force */
715 tx = _mm256_mul_ps(fscal,dx20);
716 ty = _mm256_mul_ps(fscal,dy20);
717 tz = _mm256_mul_ps(fscal,dz20);
719 /* Update vectorial force */
720 fix2 = _mm256_add_ps(fix2,tx);
721 fiy2 = _mm256_add_ps(fiy2,ty);
722 fiz2 = _mm256_add_ps(fiz2,tz);
724 fjx0 = _mm256_add_ps(fjx0,tx);
725 fjy0 = _mm256_add_ps(fjy0,ty);
726 fjz0 = _mm256_add_ps(fjz0,tz);
728 /**************************
729 * CALCULATE INTERACTIONS *
730 **************************/
732 r30 = _mm256_mul_ps(rsq30,rinv30);
733 r30 = _mm256_andnot_ps(dummy_mask,r30);
735 /* Compute parameters for interactions between i and j atoms */
736 qq30 = _mm256_mul_ps(iq3,jq0);
738 /* Calculate table index by multiplying r with table scale and truncate to integer */
739 rt = _mm256_mul_ps(r30,vftabscale);
740 vfitab = _mm256_cvttps_epi32(rt);
741 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
742 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
743 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
744 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
745 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
746 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
748 /* CUBIC SPLINE TABLE ELECTROSTATICS */
749 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
750 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
751 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
752 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
753 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
754 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
755 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
756 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
757 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
758 Heps = _mm256_mul_ps(vfeps,H);
759 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
760 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
761 velec = _mm256_mul_ps(qq30,VV);
762 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
763 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
765 /* Update potential sum for this i atom from the interaction with this j atom. */
766 velec = _mm256_andnot_ps(dummy_mask,velec);
767 velecsum = _mm256_add_ps(velecsum,velec);
771 fscal = _mm256_andnot_ps(dummy_mask,fscal);
773 /* Calculate temporary vectorial force */
774 tx = _mm256_mul_ps(fscal,dx30);
775 ty = _mm256_mul_ps(fscal,dy30);
776 tz = _mm256_mul_ps(fscal,dz30);
778 /* Update vectorial force */
779 fix3 = _mm256_add_ps(fix3,tx);
780 fiy3 = _mm256_add_ps(fiy3,ty);
781 fiz3 = _mm256_add_ps(fiz3,tz);
783 fjx0 = _mm256_add_ps(fjx0,tx);
784 fjy0 = _mm256_add_ps(fjy0,ty);
785 fjz0 = _mm256_add_ps(fjz0,tz);
787 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
788 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
789 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
790 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
791 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
792 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
793 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
794 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
796 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
798 /* Inner loop uses 167 flops */
801 /* End of innermost loop */
803 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
804 f+i_coord_offset,fshift+i_shift_offset);
807 /* Update potential energies */
808 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
809 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
811 /* Increment number of inner iterations */
812 inneriter += j_index_end - j_index_start;
814 /* Outer loop uses 26 flops */
817 /* Increment number of outer iterations */
820 /* Update outer/inner flops */
822 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*167);
825 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_single
826 * Electrostatics interaction: CubicSplineTable
827 * VdW interaction: LennardJones
828 * Geometry: Water4-Particle
829 * Calculate force/pot: Force
832 nb_kernel_ElecCSTab_VdwLJ_GeomW4P1_F_avx_256_single
833 (t_nblist * gmx_restrict nlist,
834 rvec * gmx_restrict xx,
835 rvec * gmx_restrict ff,
836 t_forcerec * gmx_restrict fr,
837 t_mdatoms * gmx_restrict mdatoms,
838 nb_kernel_data_t * gmx_restrict kernel_data,
839 t_nrnb * gmx_restrict nrnb)
841 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
842 * just 0 for non-waters.
843 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
844 * jnr indices corresponding to data put in the four positions in the SIMD register.
846 int i_shift_offset,i_coord_offset,outeriter,inneriter;
847 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
848 int jnrA,jnrB,jnrC,jnrD;
849 int jnrE,jnrF,jnrG,jnrH;
850 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
851 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
852 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
853 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
854 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
856 real *shiftvec,*fshift,*x,*f;
857 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
859 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
860 real * vdwioffsetptr0;
861 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
862 real * vdwioffsetptr1;
863 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
864 real * vdwioffsetptr2;
865 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
866 real * vdwioffsetptr3;
867 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
868 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
869 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
870 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
871 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
872 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
873 __m256 dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
874 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
877 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
880 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
881 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
883 __m128i vfitab_lo,vfitab_hi;
884 __m128i ifour = _mm_set1_epi32(4);
885 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
887 __m256 dummy_mask,cutoff_mask;
888 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
889 __m256 one = _mm256_set1_ps(1.0);
890 __m256 two = _mm256_set1_ps(2.0);
896 jindex = nlist->jindex;
898 shiftidx = nlist->shift;
900 shiftvec = fr->shift_vec[0];
901 fshift = fr->fshift[0];
902 facel = _mm256_set1_ps(fr->epsfac);
903 charge = mdatoms->chargeA;
904 nvdwtype = fr->ntype;
906 vdwtype = mdatoms->typeA;
908 vftab = kernel_data->table_elec->data;
909 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
911 /* Setup water-specific parameters */
912 inr = nlist->iinr[0];
913 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
914 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
915 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
916 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
918 /* Avoid stupid compiler warnings */
919 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
932 for(iidx=0;iidx<4*DIM;iidx++)
937 /* Start outer loop over neighborlists */
938 for(iidx=0; iidx<nri; iidx++)
940 /* Load shift vector for this list */
941 i_shift_offset = DIM*shiftidx[iidx];
943 /* Load limits for loop over neighbors */
944 j_index_start = jindex[iidx];
945 j_index_end = jindex[iidx+1];
947 /* Get outer coordinate index */
949 i_coord_offset = DIM*inr;
951 /* Load i particle coords and add shift vector */
952 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
953 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
955 fix0 = _mm256_setzero_ps();
956 fiy0 = _mm256_setzero_ps();
957 fiz0 = _mm256_setzero_ps();
958 fix1 = _mm256_setzero_ps();
959 fiy1 = _mm256_setzero_ps();
960 fiz1 = _mm256_setzero_ps();
961 fix2 = _mm256_setzero_ps();
962 fiy2 = _mm256_setzero_ps();
963 fiz2 = _mm256_setzero_ps();
964 fix3 = _mm256_setzero_ps();
965 fiy3 = _mm256_setzero_ps();
966 fiz3 = _mm256_setzero_ps();
968 /* Start inner kernel loop */
969 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
972 /* Get j neighbor index, and coordinate index */
981 j_coord_offsetA = DIM*jnrA;
982 j_coord_offsetB = DIM*jnrB;
983 j_coord_offsetC = DIM*jnrC;
984 j_coord_offsetD = DIM*jnrD;
985 j_coord_offsetE = DIM*jnrE;
986 j_coord_offsetF = DIM*jnrF;
987 j_coord_offsetG = DIM*jnrG;
988 j_coord_offsetH = DIM*jnrH;
990 /* load j atom coordinates */
991 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
992 x+j_coord_offsetC,x+j_coord_offsetD,
993 x+j_coord_offsetE,x+j_coord_offsetF,
994 x+j_coord_offsetG,x+j_coord_offsetH,
997 /* Calculate displacement vector */
998 dx00 = _mm256_sub_ps(ix0,jx0);
999 dy00 = _mm256_sub_ps(iy0,jy0);
1000 dz00 = _mm256_sub_ps(iz0,jz0);
1001 dx10 = _mm256_sub_ps(ix1,jx0);
1002 dy10 = _mm256_sub_ps(iy1,jy0);
1003 dz10 = _mm256_sub_ps(iz1,jz0);
1004 dx20 = _mm256_sub_ps(ix2,jx0);
1005 dy20 = _mm256_sub_ps(iy2,jy0);
1006 dz20 = _mm256_sub_ps(iz2,jz0);
1007 dx30 = _mm256_sub_ps(ix3,jx0);
1008 dy30 = _mm256_sub_ps(iy3,jy0);
1009 dz30 = _mm256_sub_ps(iz3,jz0);
1011 /* Calculate squared distance and things based on it */
1012 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1013 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1014 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1015 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1017 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1018 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1019 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
1021 rinvsq00 = gmx_mm256_inv_ps(rsq00);
1023 /* Load parameters for j particles */
1024 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1025 charge+jnrC+0,charge+jnrD+0,
1026 charge+jnrE+0,charge+jnrF+0,
1027 charge+jnrG+0,charge+jnrH+0);
1028 vdwjidx0A = 2*vdwtype[jnrA+0];
1029 vdwjidx0B = 2*vdwtype[jnrB+0];
1030 vdwjidx0C = 2*vdwtype[jnrC+0];
1031 vdwjidx0D = 2*vdwtype[jnrD+0];
1032 vdwjidx0E = 2*vdwtype[jnrE+0];
1033 vdwjidx0F = 2*vdwtype[jnrF+0];
1034 vdwjidx0G = 2*vdwtype[jnrG+0];
1035 vdwjidx0H = 2*vdwtype[jnrH+0];
1037 fjx0 = _mm256_setzero_ps();
1038 fjy0 = _mm256_setzero_ps();
1039 fjz0 = _mm256_setzero_ps();
1041 /**************************
1042 * CALCULATE INTERACTIONS *
1043 **************************/
1045 /* Compute parameters for interactions between i and j atoms */
1046 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1047 vdwioffsetptr0+vdwjidx0B,
1048 vdwioffsetptr0+vdwjidx0C,
1049 vdwioffsetptr0+vdwjidx0D,
1050 vdwioffsetptr0+vdwjidx0E,
1051 vdwioffsetptr0+vdwjidx0F,
1052 vdwioffsetptr0+vdwjidx0G,
1053 vdwioffsetptr0+vdwjidx0H,
1056 /* LENNARD-JONES DISPERSION/REPULSION */
1058 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1059 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1063 /* Calculate temporary vectorial force */
1064 tx = _mm256_mul_ps(fscal,dx00);
1065 ty = _mm256_mul_ps(fscal,dy00);
1066 tz = _mm256_mul_ps(fscal,dz00);
1068 /* Update vectorial force */
1069 fix0 = _mm256_add_ps(fix0,tx);
1070 fiy0 = _mm256_add_ps(fiy0,ty);
1071 fiz0 = _mm256_add_ps(fiz0,tz);
1073 fjx0 = _mm256_add_ps(fjx0,tx);
1074 fjy0 = _mm256_add_ps(fjy0,ty);
1075 fjz0 = _mm256_add_ps(fjz0,tz);
1077 /**************************
1078 * CALCULATE INTERACTIONS *
1079 **************************/
1081 r10 = _mm256_mul_ps(rsq10,rinv10);
1083 /* Compute parameters for interactions between i and j atoms */
1084 qq10 = _mm256_mul_ps(iq1,jq0);
1086 /* Calculate table index by multiplying r with table scale and truncate to integer */
1087 rt = _mm256_mul_ps(r10,vftabscale);
1088 vfitab = _mm256_cvttps_epi32(rt);
1089 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1090 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1091 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1092 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1093 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1094 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1096 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1097 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1098 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1099 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1100 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1101 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1102 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1103 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1104 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1105 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1106 Heps = _mm256_mul_ps(vfeps,H);
1107 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1108 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1109 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1113 /* Calculate temporary vectorial force */
1114 tx = _mm256_mul_ps(fscal,dx10);
1115 ty = _mm256_mul_ps(fscal,dy10);
1116 tz = _mm256_mul_ps(fscal,dz10);
1118 /* Update vectorial force */
1119 fix1 = _mm256_add_ps(fix1,tx);
1120 fiy1 = _mm256_add_ps(fiy1,ty);
1121 fiz1 = _mm256_add_ps(fiz1,tz);
1123 fjx0 = _mm256_add_ps(fjx0,tx);
1124 fjy0 = _mm256_add_ps(fjy0,ty);
1125 fjz0 = _mm256_add_ps(fjz0,tz);
1127 /**************************
1128 * CALCULATE INTERACTIONS *
1129 **************************/
1131 r20 = _mm256_mul_ps(rsq20,rinv20);
1133 /* Compute parameters for interactions between i and j atoms */
1134 qq20 = _mm256_mul_ps(iq2,jq0);
1136 /* Calculate table index by multiplying r with table scale and truncate to integer */
1137 rt = _mm256_mul_ps(r20,vftabscale);
1138 vfitab = _mm256_cvttps_epi32(rt);
1139 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1140 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1141 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1142 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1143 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1144 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1146 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1147 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1148 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1149 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1150 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1151 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1152 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1153 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1154 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1155 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1156 Heps = _mm256_mul_ps(vfeps,H);
1157 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1158 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1159 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1163 /* Calculate temporary vectorial force */
1164 tx = _mm256_mul_ps(fscal,dx20);
1165 ty = _mm256_mul_ps(fscal,dy20);
1166 tz = _mm256_mul_ps(fscal,dz20);
1168 /* Update vectorial force */
1169 fix2 = _mm256_add_ps(fix2,tx);
1170 fiy2 = _mm256_add_ps(fiy2,ty);
1171 fiz2 = _mm256_add_ps(fiz2,tz);
1173 fjx0 = _mm256_add_ps(fjx0,tx);
1174 fjy0 = _mm256_add_ps(fjy0,ty);
1175 fjz0 = _mm256_add_ps(fjz0,tz);
1177 /**************************
1178 * CALCULATE INTERACTIONS *
1179 **************************/
1181 r30 = _mm256_mul_ps(rsq30,rinv30);
1183 /* Compute parameters for interactions between i and j atoms */
1184 qq30 = _mm256_mul_ps(iq3,jq0);
1186 /* Calculate table index by multiplying r with table scale and truncate to integer */
1187 rt = _mm256_mul_ps(r30,vftabscale);
1188 vfitab = _mm256_cvttps_epi32(rt);
1189 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1190 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1191 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1192 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1193 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1194 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1196 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1197 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1198 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1199 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1200 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1201 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1202 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1203 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1204 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1205 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1206 Heps = _mm256_mul_ps(vfeps,H);
1207 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1208 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1209 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
1213 /* Calculate temporary vectorial force */
1214 tx = _mm256_mul_ps(fscal,dx30);
1215 ty = _mm256_mul_ps(fscal,dy30);
1216 tz = _mm256_mul_ps(fscal,dz30);
1218 /* Update vectorial force */
1219 fix3 = _mm256_add_ps(fix3,tx);
1220 fiy3 = _mm256_add_ps(fiy3,ty);
1221 fiz3 = _mm256_add_ps(fiz3,tz);
1223 fjx0 = _mm256_add_ps(fjx0,tx);
1224 fjy0 = _mm256_add_ps(fjy0,ty);
1225 fjz0 = _mm256_add_ps(fjz0,tz);
1227 fjptrA = f+j_coord_offsetA;
1228 fjptrB = f+j_coord_offsetB;
1229 fjptrC = f+j_coord_offsetC;
1230 fjptrD = f+j_coord_offsetD;
1231 fjptrE = f+j_coord_offsetE;
1232 fjptrF = f+j_coord_offsetF;
1233 fjptrG = f+j_coord_offsetG;
1234 fjptrH = f+j_coord_offsetH;
1236 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1238 /* Inner loop uses 147 flops */
1241 if(jidx<j_index_end)
1244 /* Get j neighbor index, and coordinate index */
1245 jnrlistA = jjnr[jidx];
1246 jnrlistB = jjnr[jidx+1];
1247 jnrlistC = jjnr[jidx+2];
1248 jnrlistD = jjnr[jidx+3];
1249 jnrlistE = jjnr[jidx+4];
1250 jnrlistF = jjnr[jidx+5];
1251 jnrlistG = jjnr[jidx+6];
1252 jnrlistH = jjnr[jidx+7];
1253 /* Sign of each element will be negative for non-real atoms.
1254 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1255 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
1257 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1258 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1260 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1261 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1262 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1263 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1264 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1265 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1266 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1267 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1268 j_coord_offsetA = DIM*jnrA;
1269 j_coord_offsetB = DIM*jnrB;
1270 j_coord_offsetC = DIM*jnrC;
1271 j_coord_offsetD = DIM*jnrD;
1272 j_coord_offsetE = DIM*jnrE;
1273 j_coord_offsetF = DIM*jnrF;
1274 j_coord_offsetG = DIM*jnrG;
1275 j_coord_offsetH = DIM*jnrH;
1277 /* load j atom coordinates */
1278 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1279 x+j_coord_offsetC,x+j_coord_offsetD,
1280 x+j_coord_offsetE,x+j_coord_offsetF,
1281 x+j_coord_offsetG,x+j_coord_offsetH,
1284 /* Calculate displacement vector */
1285 dx00 = _mm256_sub_ps(ix0,jx0);
1286 dy00 = _mm256_sub_ps(iy0,jy0);
1287 dz00 = _mm256_sub_ps(iz0,jz0);
1288 dx10 = _mm256_sub_ps(ix1,jx0);
1289 dy10 = _mm256_sub_ps(iy1,jy0);
1290 dz10 = _mm256_sub_ps(iz1,jz0);
1291 dx20 = _mm256_sub_ps(ix2,jx0);
1292 dy20 = _mm256_sub_ps(iy2,jy0);
1293 dz20 = _mm256_sub_ps(iz2,jz0);
1294 dx30 = _mm256_sub_ps(ix3,jx0);
1295 dy30 = _mm256_sub_ps(iy3,jy0);
1296 dz30 = _mm256_sub_ps(iz3,jz0);
1298 /* Calculate squared distance and things based on it */
1299 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1300 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1301 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1302 rsq30 = gmx_mm256_calc_rsq_ps(dx30,dy30,dz30);
1304 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1305 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1306 rinv30 = gmx_mm256_invsqrt_ps(rsq30);
1308 rinvsq00 = gmx_mm256_inv_ps(rsq00);
1310 /* Load parameters for j particles */
1311 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1312 charge+jnrC+0,charge+jnrD+0,
1313 charge+jnrE+0,charge+jnrF+0,
1314 charge+jnrG+0,charge+jnrH+0);
1315 vdwjidx0A = 2*vdwtype[jnrA+0];
1316 vdwjidx0B = 2*vdwtype[jnrB+0];
1317 vdwjidx0C = 2*vdwtype[jnrC+0];
1318 vdwjidx0D = 2*vdwtype[jnrD+0];
1319 vdwjidx0E = 2*vdwtype[jnrE+0];
1320 vdwjidx0F = 2*vdwtype[jnrF+0];
1321 vdwjidx0G = 2*vdwtype[jnrG+0];
1322 vdwjidx0H = 2*vdwtype[jnrH+0];
1324 fjx0 = _mm256_setzero_ps();
1325 fjy0 = _mm256_setzero_ps();
1326 fjz0 = _mm256_setzero_ps();
1328 /**************************
1329 * CALCULATE INTERACTIONS *
1330 **************************/
1332 /* Compute parameters for interactions between i and j atoms */
1333 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1334 vdwioffsetptr0+vdwjidx0B,
1335 vdwioffsetptr0+vdwjidx0C,
1336 vdwioffsetptr0+vdwjidx0D,
1337 vdwioffsetptr0+vdwjidx0E,
1338 vdwioffsetptr0+vdwjidx0F,
1339 vdwioffsetptr0+vdwjidx0G,
1340 vdwioffsetptr0+vdwjidx0H,
1343 /* LENNARD-JONES DISPERSION/REPULSION */
1345 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1346 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1350 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1352 /* Calculate temporary vectorial force */
1353 tx = _mm256_mul_ps(fscal,dx00);
1354 ty = _mm256_mul_ps(fscal,dy00);
1355 tz = _mm256_mul_ps(fscal,dz00);
1357 /* Update vectorial force */
1358 fix0 = _mm256_add_ps(fix0,tx);
1359 fiy0 = _mm256_add_ps(fiy0,ty);
1360 fiz0 = _mm256_add_ps(fiz0,tz);
1362 fjx0 = _mm256_add_ps(fjx0,tx);
1363 fjy0 = _mm256_add_ps(fjy0,ty);
1364 fjz0 = _mm256_add_ps(fjz0,tz);
1366 /**************************
1367 * CALCULATE INTERACTIONS *
1368 **************************/
1370 r10 = _mm256_mul_ps(rsq10,rinv10);
1371 r10 = _mm256_andnot_ps(dummy_mask,r10);
1373 /* Compute parameters for interactions between i and j atoms */
1374 qq10 = _mm256_mul_ps(iq1,jq0);
1376 /* Calculate table index by multiplying r with table scale and truncate to integer */
1377 rt = _mm256_mul_ps(r10,vftabscale);
1378 vfitab = _mm256_cvttps_epi32(rt);
1379 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1380 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1381 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1382 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1383 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1384 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1386 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1387 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1388 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1389 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1390 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1391 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1392 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1393 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1394 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1395 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1396 Heps = _mm256_mul_ps(vfeps,H);
1397 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1398 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1399 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1403 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1405 /* Calculate temporary vectorial force */
1406 tx = _mm256_mul_ps(fscal,dx10);
1407 ty = _mm256_mul_ps(fscal,dy10);
1408 tz = _mm256_mul_ps(fscal,dz10);
1410 /* Update vectorial force */
1411 fix1 = _mm256_add_ps(fix1,tx);
1412 fiy1 = _mm256_add_ps(fiy1,ty);
1413 fiz1 = _mm256_add_ps(fiz1,tz);
1415 fjx0 = _mm256_add_ps(fjx0,tx);
1416 fjy0 = _mm256_add_ps(fjy0,ty);
1417 fjz0 = _mm256_add_ps(fjz0,tz);
1419 /**************************
1420 * CALCULATE INTERACTIONS *
1421 **************************/
1423 r20 = _mm256_mul_ps(rsq20,rinv20);
1424 r20 = _mm256_andnot_ps(dummy_mask,r20);
1426 /* Compute parameters for interactions between i and j atoms */
1427 qq20 = _mm256_mul_ps(iq2,jq0);
1429 /* Calculate table index by multiplying r with table scale and truncate to integer */
1430 rt = _mm256_mul_ps(r20,vftabscale);
1431 vfitab = _mm256_cvttps_epi32(rt);
1432 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1433 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1434 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1435 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1436 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1437 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1439 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1440 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1441 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1442 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1443 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1444 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1445 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1446 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1447 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1448 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1449 Heps = _mm256_mul_ps(vfeps,H);
1450 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1451 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1452 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1456 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1458 /* Calculate temporary vectorial force */
1459 tx = _mm256_mul_ps(fscal,dx20);
1460 ty = _mm256_mul_ps(fscal,dy20);
1461 tz = _mm256_mul_ps(fscal,dz20);
1463 /* Update vectorial force */
1464 fix2 = _mm256_add_ps(fix2,tx);
1465 fiy2 = _mm256_add_ps(fiy2,ty);
1466 fiz2 = _mm256_add_ps(fiz2,tz);
1468 fjx0 = _mm256_add_ps(fjx0,tx);
1469 fjy0 = _mm256_add_ps(fjy0,ty);
1470 fjz0 = _mm256_add_ps(fjz0,tz);
1472 /**************************
1473 * CALCULATE INTERACTIONS *
1474 **************************/
1476 r30 = _mm256_mul_ps(rsq30,rinv30);
1477 r30 = _mm256_andnot_ps(dummy_mask,r30);
1479 /* Compute parameters for interactions between i and j atoms */
1480 qq30 = _mm256_mul_ps(iq3,jq0);
1482 /* Calculate table index by multiplying r with table scale and truncate to integer */
1483 rt = _mm256_mul_ps(r30,vftabscale);
1484 vfitab = _mm256_cvttps_epi32(rt);
1485 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1486 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1487 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1488 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1489 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1490 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1492 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1493 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1494 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1495 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1496 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1497 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1498 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1499 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1500 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1501 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1502 Heps = _mm256_mul_ps(vfeps,H);
1503 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1504 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1505 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq30,FF),_mm256_mul_ps(vftabscale,rinv30)));
1509 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1511 /* Calculate temporary vectorial force */
1512 tx = _mm256_mul_ps(fscal,dx30);
1513 ty = _mm256_mul_ps(fscal,dy30);
1514 tz = _mm256_mul_ps(fscal,dz30);
1516 /* Update vectorial force */
1517 fix3 = _mm256_add_ps(fix3,tx);
1518 fiy3 = _mm256_add_ps(fiy3,ty);
1519 fiz3 = _mm256_add_ps(fiz3,tz);
1521 fjx0 = _mm256_add_ps(fjx0,tx);
1522 fjy0 = _mm256_add_ps(fjy0,ty);
1523 fjz0 = _mm256_add_ps(fjz0,tz);
1525 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1526 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1527 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1528 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1529 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1530 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1531 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1532 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1534 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1536 /* Inner loop uses 150 flops */
1539 /* End of innermost loop */
1541 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1542 f+i_coord_offset,fshift+i_shift_offset);
1544 /* Increment number of inner iterations */
1545 inneriter += j_index_end - j_index_start;
1547 /* Outer loop uses 24 flops */
1550 /* Increment number of outer iterations */
1553 /* Update outer/inner flops */
1555 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*150);