2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_256_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: CubicSplineTable
40 * Geometry: Particle-Particle
41 * Calculate force/pot: PotentialAndForce
/*
 * Nonbonded kernel for single-particle i/j pairs that evaluates both the
 * potential and the force, using cubic-spline tables for electrostatics and
 * for VdW dispersion and repulsion. Eight j entries (suffixes A..H) are
 * processed per step in __m256 registers.
 *
 * nlist        neighbor list; jindex and shift are read from it here.
 * xx, ff       not referenced directly in the visible lines; coordinates and
 *              forces are accessed through x and f (NOTE(review): presumably
 *              derived from xx/ff on lines missing from this copy - confirm).
 * fr           supplies shift_vec, fshift, epsfac and ntype.
 * mdatoms      supplies chargeA and typeA.
 * kernel_data  supplies table_elec_vdw (data, scale) and the energygrp_elec /
 *              energygrp_vdw accumulators that receive the potential sums.
 * nrnb         flop counter, updated through inc_nrnb() at the end.
 *
 * NOTE(review): this copy of the file is missing original lines (braces, the
 * return type, and the declarations/assignments of e.g. nri, jjnr, x, f, inr,
 * ggid, scratch, vdwparam and vfitab), so those are not documented here.
 */
44 nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
75 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
77 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
80 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
83 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
84 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
86 __m128i vfitab_lo,vfitab_hi;
87 __m128i ifour = _mm_set1_epi32(4);
88 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
90 __m256 dummy_mask,cutoff_mask;
91 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
92 __m256 one = _mm256_set1_ps(1.0);
93 __m256 two = _mm256_set1_ps(2.0);
99 jindex = nlist->jindex;
101 shiftidx = nlist->shift;
103 shiftvec = fr->shift_vec[0];
104 fshift = fr->fshift[0];
105 facel = _mm256_set1_ps(fr->epsfac);
106 charge = mdatoms->chargeA;
107 nvdwtype = fr->ntype;
109 vdwtype = mdatoms->typeA;
111 vftab = kernel_data->table_elec_vdw->data;
112 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
114 /* Avoid stupid compiler warnings */
115 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
/* NOTE(review): the body of the next loop is not visible in this copy; presumably it clears the scratch buffer used for dummy entries - confirm */
128 for(iidx=0;iidx<4*DIM;iidx++)
133 /* Start outer loop over neighborlists */
134 for(iidx=0; iidx<nri; iidx++)
136 /* Load shift vector for this list */
137 i_shift_offset = DIM*shiftidx[iidx];
139 /* Load limits for loop over neighbors */
140 j_index_start = jindex[iidx];
141 j_index_end = jindex[iidx+1];
143 /* Get outer coordinate index */
145 i_coord_offset = DIM*inr;
147 /* Load i particle coords and add shift vector */
148 gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
150 fix0 = _mm256_setzero_ps();
151 fiy0 = _mm256_setzero_ps();
152 fiz0 = _mm256_setzero_ps();
154 /* Load parameters for i particles */
155 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
156 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
158 /* Reset potential sums */
159 velecsum = _mm256_setzero_ps();
160 vvdwsum = _mm256_setzero_ps();
162 /* Start inner kernel loop */
/* Full batches of eight: continue only while the eighth entry jjnr[jidx+7] is non-negative */
163 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
166 /* Get j neighbor index, and coordinate index */
175 j_coord_offsetA = DIM*jnrA;
176 j_coord_offsetB = DIM*jnrB;
177 j_coord_offsetC = DIM*jnrC;
178 j_coord_offsetD = DIM*jnrD;
179 j_coord_offsetE = DIM*jnrE;
180 j_coord_offsetF = DIM*jnrF;
181 j_coord_offsetG = DIM*jnrG;
182 j_coord_offsetH = DIM*jnrH;
184 /* load j atom coordinates */
185 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
186 x+j_coord_offsetC,x+j_coord_offsetD,
187 x+j_coord_offsetE,x+j_coord_offsetF,
188 x+j_coord_offsetG,x+j_coord_offsetH,
191 /* Calculate displacement vector */
192 dx00 = _mm256_sub_ps(ix0,jx0);
193 dy00 = _mm256_sub_ps(iy0,jy0);
194 dz00 = _mm256_sub_ps(iz0,jz0);
196 /* Calculate squared distance and things based on it */
197 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
199 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
201 /* Load parameters for j particles */
202 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
203 charge+jnrC+0,charge+jnrD+0,
204 charge+jnrE+0,charge+jnrF+0,
205 charge+jnrG+0,charge+jnrH+0);
206 vdwjidx0A = 2*vdwtype[jnrA+0];
207 vdwjidx0B = 2*vdwtype[jnrB+0];
208 vdwjidx0C = 2*vdwtype[jnrC+0];
209 vdwjidx0D = 2*vdwtype[jnrD+0];
210 vdwjidx0E = 2*vdwtype[jnrE+0];
211 vdwjidx0F = 2*vdwtype[jnrF+0];
212 vdwjidx0G = 2*vdwtype[jnrG+0];
213 vdwjidx0H = 2*vdwtype[jnrH+0];
215 /**************************
216 * CALCULATE INTERACTIONS *
217 **************************/
219 r00 = _mm256_mul_ps(rsq00,rinv00);
221 /* Compute parameters for interactions between i and j atoms */
222 qq00 = _mm256_mul_ps(iq0,jq0);
223 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
224 vdwioffsetptr0+vdwjidx0B,
225 vdwioffsetptr0+vdwjidx0C,
226 vdwioffsetptr0+vdwjidx0D,
227 vdwioffsetptr0+vdwjidx0E,
228 vdwioffsetptr0+vdwjidx0F,
229 vdwioffsetptr0+vdwjidx0G,
230 vdwioffsetptr0+vdwjidx0H,
233 /* Calculate table index by multiplying r with table scale and truncate to integer */
234 rt = _mm256_mul_ps(r00,vftabscale);
235 vfitab = _mm256_cvttps_epi32(rt);
/* vfeps is the fractional part of rt, i.e. the position inside the table interval */
236 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
237 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
238 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
239 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
/* (i + 2*i) << 2 == 12*i: each table point is addressed with a stride of 12 floats */
240 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
241 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
243 /* CUBIC SPLINE TABLE ELECTROSTATICS */
244 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
245 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
246 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
247 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
248 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
249 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
250 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
251 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
252 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
/* With eps = vfeps: VV = Y + eps*F + eps^2*G + eps^3*H and FF = F + 2*eps*G + 3*eps^2*H */
253 Heps = _mm256_mul_ps(vfeps,H);
254 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
255 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
256 velec = _mm256_mul_ps(qq00,VV);
257 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
/* felec = -(qq00*FF)*vftabscale*rinv00; the XOR with signbit flips the sign */
258 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
260 /* CUBIC SPLINE TABLE DISPERSION */
/* Advance the table offsets by 4 floats to the next group of four coefficients */
261 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
262 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
263 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
264 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
265 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
266 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
267 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
268 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
269 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
270 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
271 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
272 Heps = _mm256_mul_ps(vfeps,H);
273 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
274 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
275 vvdw6 = _mm256_mul_ps(c6_00,VV);
276 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
277 fvdw6 = _mm256_mul_ps(c6_00,FF);
279 /* CUBIC SPLINE TABLE REPULSION */
/* Advance by another 4 floats (offset 8 within the 12-float stride) */
280 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
281 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
282 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
283 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
284 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
285 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
286 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
287 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
288 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
289 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
290 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
291 Heps = _mm256_mul_ps(vfeps,H);
292 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
293 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
294 vvdw12 = _mm256_mul_ps(c12_00,VV);
295 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
296 fvdw12 = _mm256_mul_ps(c12_00,FF);
297 vvdw = _mm256_add_ps(vvdw12,vvdw6);
298 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
300 /* Update potential sum for this i atom from the interaction with this j atom. */
301 velecsum = _mm256_add_ps(velecsum,velec);
302 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
304 fscal = _mm256_add_ps(felec,fvdw);
306 /* Calculate temporary vectorial force */
307 tx = _mm256_mul_ps(fscal,dx00);
308 ty = _mm256_mul_ps(fscal,dy00);
309 tz = _mm256_mul_ps(fscal,dz00);
311 /* Update vectorial force */
312 fix0 = _mm256_add_ps(fix0,tx);
313 fiy0 = _mm256_add_ps(fiy0,ty);
314 fiz0 = _mm256_add_ps(fiz0,tz);
316 fjptrA = f+j_coord_offsetA;
317 fjptrB = f+j_coord_offsetB;
318 fjptrC = f+j_coord_offsetC;
319 fjptrD = f+j_coord_offsetD;
320 fjptrE = f+j_coord_offsetE;
321 fjptrF = f+j_coord_offsetF;
322 fjptrG = f+j_coord_offsetG;
323 fjptrH = f+j_coord_offsetH;
324 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
326 /* Inner loop uses 73 flops */
/* Remainder pass: same computation, but entries with a negative jjnr value are masked out as dummies. NOTE(review): the statement guarding this section is not visible in this copy. */
332 /* Get j neighbor index, and coordinate index */
333 jnrlistA = jjnr[jidx];
334 jnrlistB = jjnr[jidx+1];
335 jnrlistC = jjnr[jidx+2];
336 jnrlistD = jjnr[jidx+3];
337 jnrlistE = jjnr[jidx+4];
338 jnrlistF = jjnr[jidx+5];
339 jnrlistG = jjnr[jidx+6];
340 jnrlistH = jjnr[jidx+7];
341 /* Sign of each element will be negative for non-real atoms.
342 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
343 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
345 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
346 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
348 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
349 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
350 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
351 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
352 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
353 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
354 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
355 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
356 j_coord_offsetA = DIM*jnrA;
357 j_coord_offsetB = DIM*jnrB;
358 j_coord_offsetC = DIM*jnrC;
359 j_coord_offsetD = DIM*jnrD;
360 j_coord_offsetE = DIM*jnrE;
361 j_coord_offsetF = DIM*jnrF;
362 j_coord_offsetG = DIM*jnrG;
363 j_coord_offsetH = DIM*jnrH;
365 /* load j atom coordinates */
366 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
367 x+j_coord_offsetC,x+j_coord_offsetD,
368 x+j_coord_offsetE,x+j_coord_offsetF,
369 x+j_coord_offsetG,x+j_coord_offsetH,
372 /* Calculate displacement vector */
373 dx00 = _mm256_sub_ps(ix0,jx0);
374 dy00 = _mm256_sub_ps(iy0,jy0);
375 dz00 = _mm256_sub_ps(iz0,jz0);
377 /* Calculate squared distance and things based on it */
378 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
380 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
382 /* Load parameters for j particles */
383 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
384 charge+jnrC+0,charge+jnrD+0,
385 charge+jnrE+0,charge+jnrF+0,
386 charge+jnrG+0,charge+jnrH+0);
387 vdwjidx0A = 2*vdwtype[jnrA+0];
388 vdwjidx0B = 2*vdwtype[jnrB+0];
389 vdwjidx0C = 2*vdwtype[jnrC+0];
390 vdwjidx0D = 2*vdwtype[jnrD+0];
391 vdwjidx0E = 2*vdwtype[jnrE+0];
392 vdwjidx0F = 2*vdwtype[jnrF+0];
393 vdwjidx0G = 2*vdwtype[jnrG+0];
394 vdwjidx0H = 2*vdwtype[jnrH+0];
396 /**************************
397 * CALCULATE INTERACTIONS *
398 **************************/
400 r00 = _mm256_mul_ps(rsq00,rinv00);
/* Zero r in dummy lanes so that their table index computed below becomes 0 */
401 r00 = _mm256_andnot_ps(dummy_mask,r00);
403 /* Compute parameters for interactions between i and j atoms */
404 qq00 = _mm256_mul_ps(iq0,jq0);
405 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
406 vdwioffsetptr0+vdwjidx0B,
407 vdwioffsetptr0+vdwjidx0C,
408 vdwioffsetptr0+vdwjidx0D,
409 vdwioffsetptr0+vdwjidx0E,
410 vdwioffsetptr0+vdwjidx0F,
411 vdwioffsetptr0+vdwjidx0G,
412 vdwioffsetptr0+vdwjidx0H,
415 /* Calculate table index by multiplying r with table scale and truncate to integer */
416 rt = _mm256_mul_ps(r00,vftabscale);
417 vfitab = _mm256_cvttps_epi32(rt);
418 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
419 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
420 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
421 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
422 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
423 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
425 /* CUBIC SPLINE TABLE ELECTROSTATICS */
426 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
427 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
428 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
429 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
430 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
431 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
432 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
433 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
434 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
435 Heps = _mm256_mul_ps(vfeps,H);
436 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
437 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
438 velec = _mm256_mul_ps(qq00,VV);
439 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
440 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
442 /* CUBIC SPLINE TABLE DISPERSION */
443 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
444 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
445 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
446 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
447 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
448 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
449 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
450 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
451 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
452 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
453 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
454 Heps = _mm256_mul_ps(vfeps,H);
455 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
456 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
457 vvdw6 = _mm256_mul_ps(c6_00,VV);
458 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
459 fvdw6 = _mm256_mul_ps(c6_00,FF);
461 /* CUBIC SPLINE TABLE REPULSION */
462 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
463 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
464 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
465 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
466 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
467 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
468 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
469 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
470 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
471 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
472 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
473 Heps = _mm256_mul_ps(vfeps,H);
474 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
475 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
476 vvdw12 = _mm256_mul_ps(c12_00,VV);
477 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
478 fvdw12 = _mm256_mul_ps(c12_00,FF);
479 vvdw = _mm256_add_ps(vvdw12,vvdw6);
480 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
482 /* Update potential sum for this i atom from the interaction with this j atom. */
483 velec = _mm256_andnot_ps(dummy_mask,velec);
484 velecsum = _mm256_add_ps(velecsum,velec);
485 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
486 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
488 fscal = _mm256_add_ps(felec,fvdw);
490 fscal = _mm256_andnot_ps(dummy_mask,fscal);
492 /* Calculate temporary vectorial force */
493 tx = _mm256_mul_ps(fscal,dx00);
494 ty = _mm256_mul_ps(fscal,dy00);
495 tz = _mm256_mul_ps(fscal,dz00);
497 /* Update vectorial force */
498 fix0 = _mm256_add_ps(fix0,tx);
499 fiy0 = _mm256_add_ps(fiy0,ty);
500 fiz0 = _mm256_add_ps(fiz0,tz);
/* Dummy lanes direct their force update at scratch instead of the real force array f */
502 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
503 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
504 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
505 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
506 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
507 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
508 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
509 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
510 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
512 /* Inner loop uses 74 flops */
515 /* End of innermost loop */
517 gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
518 f+i_coord_offset,fshift+i_shift_offset);
521 /* Update potential energies */
522 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
523 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
525 /* Increment number of inner iterations */
526 inneriter += j_index_end - j_index_start;
528 /* Outer loop uses 9 flops */
531 /* Increment number of outer iterations */
534 /* Update outer/inner flops */
/* Flop estimate reported to nrnb: 9 per outer iteration plus 74 per inner-list entry */
536 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*74);
539 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_256_single
540 * Electrostatics interaction: CubicSplineTable
541 * VdW interaction: CubicSplineTable
542 * Geometry: Particle-Particle
543 * Calculate force/pot: Force
546 nb_kernel_ElecCSTab_VdwCSTab_GeomP1P1_F_avx_256_single
547 (t_nblist * gmx_restrict nlist,
548 rvec * gmx_restrict xx,
549 rvec * gmx_restrict ff,
550 t_forcerec * gmx_restrict fr,
551 t_mdatoms * gmx_restrict mdatoms,
552 nb_kernel_data_t * gmx_restrict kernel_data,
553 t_nrnb * gmx_restrict nrnb)
555 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
556 * just 0 for non-waters.
557 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
558 * jnr indices corresponding to data put in the four positions in the SIMD register.
560 int i_shift_offset,i_coord_offset,outeriter,inneriter;
561 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
562 int jnrA,jnrB,jnrC,jnrD;
563 int jnrE,jnrF,jnrG,jnrH;
564 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
565 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
566 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
567 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
568 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
570 real *shiftvec,*fshift,*x,*f;
571 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
573 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
574 real * vdwioffsetptr0;
575 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
576 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
577 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
578 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
579 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
582 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
585 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
586 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
588 __m128i vfitab_lo,vfitab_hi;
589 __m128i ifour = _mm_set1_epi32(4);
590 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
592 __m256 dummy_mask,cutoff_mask;
593 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
594 __m256 one = _mm256_set1_ps(1.0);
595 __m256 two = _mm256_set1_ps(2.0);
601 jindex = nlist->jindex;
603 shiftidx = nlist->shift;
605 shiftvec = fr->shift_vec[0];
606 fshift = fr->fshift[0];
607 facel = _mm256_set1_ps(fr->epsfac);
608 charge = mdatoms->chargeA;
609 nvdwtype = fr->ntype;
611 vdwtype = mdatoms->typeA;
613 vftab = kernel_data->table_elec_vdw->data;
614 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
616 /* Avoid stupid compiler warnings */
617 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
630 for(iidx=0;iidx<4*DIM;iidx++)
635 /* Start outer loop over neighborlists */
636 for(iidx=0; iidx<nri; iidx++)
638 /* Load shift vector for this list */
639 i_shift_offset = DIM*shiftidx[iidx];
641 /* Load limits for loop over neighbors */
642 j_index_start = jindex[iidx];
643 j_index_end = jindex[iidx+1];
645 /* Get outer coordinate index */
647 i_coord_offset = DIM*inr;
649 /* Load i particle coords and add shift vector */
650 gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
652 fix0 = _mm256_setzero_ps();
653 fiy0 = _mm256_setzero_ps();
654 fiz0 = _mm256_setzero_ps();
656 /* Load parameters for i particles */
657 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
658 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
660 /* Start inner kernel loop */
661 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
664 /* Get j neighbor index, and coordinate index */
673 j_coord_offsetA = DIM*jnrA;
674 j_coord_offsetB = DIM*jnrB;
675 j_coord_offsetC = DIM*jnrC;
676 j_coord_offsetD = DIM*jnrD;
677 j_coord_offsetE = DIM*jnrE;
678 j_coord_offsetF = DIM*jnrF;
679 j_coord_offsetG = DIM*jnrG;
680 j_coord_offsetH = DIM*jnrH;
682 /* load j atom coordinates */
683 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
684 x+j_coord_offsetC,x+j_coord_offsetD,
685 x+j_coord_offsetE,x+j_coord_offsetF,
686 x+j_coord_offsetG,x+j_coord_offsetH,
689 /* Calculate displacement vector */
690 dx00 = _mm256_sub_ps(ix0,jx0);
691 dy00 = _mm256_sub_ps(iy0,jy0);
692 dz00 = _mm256_sub_ps(iz0,jz0);
694 /* Calculate squared distance and things based on it */
695 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
697 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
699 /* Load parameters for j particles */
700 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
701 charge+jnrC+0,charge+jnrD+0,
702 charge+jnrE+0,charge+jnrF+0,
703 charge+jnrG+0,charge+jnrH+0);
704 vdwjidx0A = 2*vdwtype[jnrA+0];
705 vdwjidx0B = 2*vdwtype[jnrB+0];
706 vdwjidx0C = 2*vdwtype[jnrC+0];
707 vdwjidx0D = 2*vdwtype[jnrD+0];
708 vdwjidx0E = 2*vdwtype[jnrE+0];
709 vdwjidx0F = 2*vdwtype[jnrF+0];
710 vdwjidx0G = 2*vdwtype[jnrG+0];
711 vdwjidx0H = 2*vdwtype[jnrH+0];
713 /**************************
714 * CALCULATE INTERACTIONS *
715 **************************/
717 r00 = _mm256_mul_ps(rsq00,rinv00);
719 /* Compute parameters for interactions between i and j atoms */
720 qq00 = _mm256_mul_ps(iq0,jq0);
721 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
722 vdwioffsetptr0+vdwjidx0B,
723 vdwioffsetptr0+vdwjidx0C,
724 vdwioffsetptr0+vdwjidx0D,
725 vdwioffsetptr0+vdwjidx0E,
726 vdwioffsetptr0+vdwjidx0F,
727 vdwioffsetptr0+vdwjidx0G,
728 vdwioffsetptr0+vdwjidx0H,
731 /* Calculate table index by multiplying r with table scale and truncate to integer */
732 rt = _mm256_mul_ps(r00,vftabscale);
733 vfitab = _mm256_cvttps_epi32(rt);
734 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
735 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
736 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
737 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
738 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
739 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
741 /* CUBIC SPLINE TABLE ELECTROSTATICS */
742 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
743 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
744 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
745 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
746 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
747 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
748 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
749 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
750 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
751 Heps = _mm256_mul_ps(vfeps,H);
752 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
753 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
754 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
756 /* CUBIC SPLINE TABLE DISPERSION */
757 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
758 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
759 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
760 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
761 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
762 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
763 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
764 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
765 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
766 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
767 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
768 Heps = _mm256_mul_ps(vfeps,H);
769 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
770 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
771 fvdw6 = _mm256_mul_ps(c6_00,FF);
773 /* CUBIC SPLINE TABLE REPULSION */
774 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
775 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
776 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
777 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
778 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
779 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
780 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
781 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
782 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
783 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
784 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
785 Heps = _mm256_mul_ps(vfeps,H);
786 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
787 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
788 fvdw12 = _mm256_mul_ps(c12_00,FF);
789 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
791 fscal = _mm256_add_ps(felec,fvdw);
793 /* Calculate temporary vectorial force */
794 tx = _mm256_mul_ps(fscal,dx00);
795 ty = _mm256_mul_ps(fscal,dy00);
796 tz = _mm256_mul_ps(fscal,dz00);
798 /* Update vectorial force */
799 fix0 = _mm256_add_ps(fix0,tx);
800 fiy0 = _mm256_add_ps(fiy0,ty);
801 fiz0 = _mm256_add_ps(fiz0,tz);
803 fjptrA = f+j_coord_offsetA;
804 fjptrB = f+j_coord_offsetB;
805 fjptrC = f+j_coord_offsetC;
806 fjptrD = f+j_coord_offsetD;
807 fjptrE = f+j_coord_offsetE;
808 fjptrF = f+j_coord_offsetF;
809 fjptrG = f+j_coord_offsetG;
810 fjptrH = f+j_coord_offsetH;
811 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
813 /* Inner loop uses 61 flops */
819 /* Get j neighbor index, and coordinate index */
820 jnrlistA = jjnr[jidx];
821 jnrlistB = jjnr[jidx+1];
822 jnrlistC = jjnr[jidx+2];
823 jnrlistD = jjnr[jidx+3];
824 jnrlistE = jjnr[jidx+4];
825 jnrlistF = jjnr[jidx+5];
826 jnrlistG = jjnr[jidx+6];
827 jnrlistH = jjnr[jidx+7];
828 /* Sign of each element will be negative for non-real atoms.
829 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
830 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
832 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
833 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
835 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
836 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
837 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
838 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
839 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
840 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
841 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
842 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
843 j_coord_offsetA = DIM*jnrA;
844 j_coord_offsetB = DIM*jnrB;
845 j_coord_offsetC = DIM*jnrC;
846 j_coord_offsetD = DIM*jnrD;
847 j_coord_offsetE = DIM*jnrE;
848 j_coord_offsetF = DIM*jnrF;
849 j_coord_offsetG = DIM*jnrG;
850 j_coord_offsetH = DIM*jnrH;
852 /* load j atom coordinates */
853 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
854 x+j_coord_offsetC,x+j_coord_offsetD,
855 x+j_coord_offsetE,x+j_coord_offsetF,
856 x+j_coord_offsetG,x+j_coord_offsetH,
859 /* Calculate displacement vector */
860 dx00 = _mm256_sub_ps(ix0,jx0);
861 dy00 = _mm256_sub_ps(iy0,jy0);
862 dz00 = _mm256_sub_ps(iz0,jz0);
864 /* Calculate squared distance and things based on it */
865 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
867 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
869 /* Load parameters for j particles */
870 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
871 charge+jnrC+0,charge+jnrD+0,
872 charge+jnrE+0,charge+jnrF+0,
873 charge+jnrG+0,charge+jnrH+0);
874 vdwjidx0A = 2*vdwtype[jnrA+0];
875 vdwjidx0B = 2*vdwtype[jnrB+0];
876 vdwjidx0C = 2*vdwtype[jnrC+0];
877 vdwjidx0D = 2*vdwtype[jnrD+0];
878 vdwjidx0E = 2*vdwtype[jnrE+0];
879 vdwjidx0F = 2*vdwtype[jnrF+0];
880 vdwjidx0G = 2*vdwtype[jnrG+0];
881 vdwjidx0H = 2*vdwtype[jnrH+0];
883 /**************************
884 * CALCULATE INTERACTIONS *
885 **************************/
887 r00 = _mm256_mul_ps(rsq00,rinv00);
888 r00 = _mm256_andnot_ps(dummy_mask,r00);
890 /* Compute parameters for interactions between i and j atoms */
891 qq00 = _mm256_mul_ps(iq0,jq0);
892 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
893 vdwioffsetptr0+vdwjidx0B,
894 vdwioffsetptr0+vdwjidx0C,
895 vdwioffsetptr0+vdwjidx0D,
896 vdwioffsetptr0+vdwjidx0E,
897 vdwioffsetptr0+vdwjidx0F,
898 vdwioffsetptr0+vdwjidx0G,
899 vdwioffsetptr0+vdwjidx0H,
902 /* Calculate table index by multiplying r with table scale and truncate to integer */
903 rt = _mm256_mul_ps(r00,vftabscale);
904 vfitab = _mm256_cvttps_epi32(rt);
905 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
906 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
907 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
908 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
909 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
910 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
912 /* CUBIC SPLINE TABLE ELECTROSTATICS */
913 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
914 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
915 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
916 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
917 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
918 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
919 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
920 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
921 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
922 Heps = _mm256_mul_ps(vfeps,H);
923 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
924 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
925 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
927 /* CUBIC SPLINE TABLE DISPERSION */
928 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
929 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
930 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
931 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
932 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
933 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
934 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
935 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
936 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
937 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
938 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
939 Heps = _mm256_mul_ps(vfeps,H);
940 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
941 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
942 fvdw6 = _mm256_mul_ps(c6_00,FF);
944 /* CUBIC SPLINE TABLE REPULSION */
945 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
946 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
947 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
948 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
949 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
950 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
951 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
952 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
953 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
954 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
955 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
956 Heps = _mm256_mul_ps(vfeps,H);
957 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
958 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
959 fvdw12 = _mm256_mul_ps(c12_00,FF);
960 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
962 fscal = _mm256_add_ps(felec,fvdw);
964 fscal = _mm256_andnot_ps(dummy_mask,fscal);
966 /* Calculate temporary vectorial force */
967 tx = _mm256_mul_ps(fscal,dx00);
968 ty = _mm256_mul_ps(fscal,dy00);
969 tz = _mm256_mul_ps(fscal,dz00);
971 /* Update vectorial force */
972 fix0 = _mm256_add_ps(fix0,tx);
973 fiy0 = _mm256_add_ps(fiy0,ty);
974 fiz0 = _mm256_add_ps(fiz0,tz);
976 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
977 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
978 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
979 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
980 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
981 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
982 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
983 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
984 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
986 /* Inner loop uses 62 flops */
989 /* End of innermost loop */
991 gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
992 f+i_coord_offset,fshift+i_shift_offset);
994 /* Increment number of inner iterations */
995 inneriter += j_index_end - j_index_start;
997 /* Outer loop uses 7 flops */
1000 /* Increment number of outer iterations */
1003 /* Update outer/inner flops */
1005 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*62);