2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_256_single
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: CubicSplineTable
40 * Geometry: Particle-Particle
41 * Calculate force/pot: PotentialAndForce
/*
 * Kernel computing plain Coulomb electrostatics plus cubic-spline-table
 * Van der Waals interactions for single-particle i/j pairs, accumulating
 * both forces and potential energies.  j neighbours are processed eight at
 * a time in 256-bit single-precision registers.
 *
 * NOTE(review): the numeric prefix on each line below looks like the line
 * number of the original generated file, and that sequence has gaps (no
 * braces, and no visible declarations for charge, vdwparam, vftab, vfitab
 * or scratch), so this listing is presumably an incomplete extract.
 * Confirm against the generator output before relying on it.
 */
44 nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
75 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
77 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
80 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
83 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
84 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
86 __m128i vfitab_lo,vfitab_hi;
87 __m128i ifour = _mm_set1_epi32(4);
88 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
90 __m256 dummy_mask,cutoff_mask;
91 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
92 __m256 one = _mm256_set1_ps(1.0);
93 __m256 two = _mm256_set1_ps(2.0);
99 jindex = nlist->jindex;
101 shiftidx = nlist->shift;
103 shiftvec = fr->shift_vec[0];
104 fshift = fr->fshift[0];
105 facel = _mm256_set1_ps(fr->epsfac);
106 charge = mdatoms->chargeA;
107 nvdwtype = fr->ntype;
109 vdwtype = mdatoms->typeA;
111 vftab = kernel_data->table_vdw->data;
112 vftabscale = _mm256_set1_ps(kernel_data->table_vdw->scale);
114 /* Avoid stupid compiler warnings */
115 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
/* NOTE(review): the body of the next loop is not visible in this listing;
 * presumably it initialises the scratch buffer used for dummy entries - confirm. */
128 for(iidx=0;iidx<4*DIM;iidx++)
133 /* Start outer loop over neighborlists */
134 for(iidx=0; iidx<nri; iidx++)
136 /* Load shift vector for this list */
137 i_shift_offset = DIM*shiftidx[iidx];
139 /* Load limits for loop over neighbors */
140 j_index_start = jindex[iidx];
141 j_index_end = jindex[iidx+1];
143 /* Get outer coordinate index */
145 i_coord_offset = DIM*inr;
147 /* Load i particle coords and add shift vector */
148 gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
150 fix0 = _mm256_setzero_ps();
151 fiy0 = _mm256_setzero_ps();
152 fiz0 = _mm256_setzero_ps();
154 /* Load parameters for i particles */
/* The i charge is pre-multiplied by facel (fr->epsfac), so qq00 = iq0*jq0 already carries that factor. */
155 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
/* Base pointer into vdwparam for this i type; j entries are addressed below as 2*type (two reals per pair, loaded as c6/c12). */
156 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
158 /* Reset potential sums */
159 velecsum = _mm256_setzero_ps();
160 vvdwsum = _mm256_setzero_ps();
162 /* Start inner kernel loop */
/* Full-width pass: advances eight list entries per iteration and only tests
 * the last index of each group of eight.  Groups that fail the test appear to
 * be handled by the masked section further down, which treats negative
 * indices as dummy entries. */
163 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
166 /* Get j neighbor index, and coordinate index */
175 j_coord_offsetA = DIM*jnrA;
176 j_coord_offsetB = DIM*jnrB;
177 j_coord_offsetC = DIM*jnrC;
178 j_coord_offsetD = DIM*jnrD;
179 j_coord_offsetE = DIM*jnrE;
180 j_coord_offsetF = DIM*jnrF;
181 j_coord_offsetG = DIM*jnrG;
182 j_coord_offsetH = DIM*jnrH;
184 /* load j atom coordinates */
185 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
186 x+j_coord_offsetC,x+j_coord_offsetD,
187 x+j_coord_offsetE,x+j_coord_offsetF,
188 x+j_coord_offsetG,x+j_coord_offsetH,
191 /* Calculate displacement vector */
192 dx00 = _mm256_sub_ps(ix0,jx0);
193 dy00 = _mm256_sub_ps(iy0,jy0);
194 dz00 = _mm256_sub_ps(iz0,jz0);
196 /* Calculate squared distance and things based on it */
197 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
199 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
201 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
203 /* Load parameters for j particles */
204 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
205 charge+jnrC+0,charge+jnrD+0,
206 charge+jnrE+0,charge+jnrF+0,
207 charge+jnrG+0,charge+jnrH+0);
208 vdwjidx0A = 2*vdwtype[jnrA+0];
209 vdwjidx0B = 2*vdwtype[jnrB+0];
210 vdwjidx0C = 2*vdwtype[jnrC+0];
211 vdwjidx0D = 2*vdwtype[jnrD+0];
212 vdwjidx0E = 2*vdwtype[jnrE+0];
213 vdwjidx0F = 2*vdwtype[jnrF+0];
214 vdwjidx0G = 2*vdwtype[jnrG+0];
215 vdwjidx0H = 2*vdwtype[jnrH+0];
217 /**************************
218 * CALCULATE INTERACTIONS *
219 **************************/
/* r = rsq * (1/sqrt(rsq)) */
221 r00 = _mm256_mul_ps(rsq00,rinv00);
223 /* Compute parameters for interactions between i and j atoms */
224 qq00 = _mm256_mul_ps(iq0,jq0);
225 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
226 vdwioffsetptr0+vdwjidx0B,
227 vdwioffsetptr0+vdwjidx0C,
228 vdwioffsetptr0+vdwjidx0D,
229 vdwioffsetptr0+vdwjidx0E,
230 vdwioffsetptr0+vdwjidx0F,
231 vdwioffsetptr0+vdwjidx0G,
232 vdwioffsetptr0+vdwjidx0H,
235 /* Calculate table index by multiplying r with table scale and truncate to integer */
236 rt = _mm256_mul_ps(r00,vftabscale);
237 vfitab = _mm256_cvttps_epi32(rt);
/* vfeps = rt - floor(rt): the fractional part of the scaled distance */
238 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
239 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
240 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
241 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
/* Shift left by 3 = multiply the index by 8: each table point is addressed
 * with a stride of eight reals, four (Y,F,G,H) read for dispersion and four
 * more, at offset +4 (ifour), read for repulsion. */
242 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
243 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
245 /* COULOMB ELECTROSTATICS */
/* velec = qq/r, felec = velec/r^2 */
246 velec = _mm256_mul_ps(qq00,rinv00);
247 felec = _mm256_mul_ps(velec,rinvsq00);
249 /* CUBIC SPLINE TABLE DISPERSION */
250 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
251 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
252 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
253 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
254 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
255 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
256 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
257 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
258 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
/* Cubic spline evaluation with eps = vfeps:
 *   Fp = F + eps*(G + eps*H)
 *   VV = Y + eps*Fp               (scaled by c6 to give the potential term)
 *   FF = Fp + eps*(G + 2*eps*H)   (derivative of VV with respect to eps)
 */
259 Heps = _mm256_mul_ps(vfeps,H);
260 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
261 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
262 vvdw6 = _mm256_mul_ps(c6_00,VV);
263 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
264 fvdw6 = _mm256_mul_ps(c6_00,FF);
266 /* CUBIC SPLINE TABLE REPULSION */
/* Same spline evaluation as above, on the four reals at offset +4, scaled by c12 */
267 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
268 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
269 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
270 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
271 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
272 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
273 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
274 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
275 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
276 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
277 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
278 Heps = _mm256_mul_ps(vfeps,H);
279 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
280 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
281 vvdw12 = _mm256_mul_ps(c12_00,VV);
282 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
283 fvdw12 = _mm256_mul_ps(c12_00,FF);
284 vvdw = _mm256_add_ps(vvdw12,vvdw6);
/* fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00; the XOR with signbit flips the sign */
285 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
287 /* Update potential sum for this i atom from the interaction with this j atom. */
288 velecsum = _mm256_add_ps(velecsum,velec);
289 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
291 fscal = _mm256_add_ps(felec,fvdw);
293 /* Calculate temporary vectorial force */
294 tx = _mm256_mul_ps(fscal,dx00);
295 ty = _mm256_mul_ps(fscal,dy00);
296 tz = _mm256_mul_ps(fscal,dz00);
298 /* Update vectorial force */
299 fix0 = _mm256_add_ps(fix0,tx);
300 fiy0 = _mm256_add_ps(fiy0,ty);
301 fiz0 = _mm256_add_ps(fiz0,tz);
/* The same tx,ty,tz added to the i force above are handed to the decrement helper for the eight j atoms */
303 fjptrA = f+j_coord_offsetA;
304 fjptrB = f+j_coord_offsetB;
305 fjptrC = f+j_coord_offsetC;
306 fjptrD = f+j_coord_offsetD;
307 fjptrE = f+j_coord_offsetE;
308 fjptrF = f+j_coord_offsetF;
309 fjptrG = f+j_coord_offsetG;
310 fjptrH = f+j_coord_offsetH;
311 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
313 /* Inner loop uses 62 flops */
/* Masked pass for a group of eight that contains negative (dummy) indices.
 * Negative indices are replaced by 0 for the loads, and the resulting
 * potentials and forces of those lanes are cleared with dummy_mask.
 * NOTE(review): the statement guarding this section is not visible in this listing. */
319 /* Get j neighbor index, and coordinate index */
320 jnrlistA = jjnr[jidx];
321 jnrlistB = jjnr[jidx+1];
322 jnrlistC = jjnr[jidx+2];
323 jnrlistD = jjnr[jidx+3];
324 jnrlistE = jjnr[jidx+4];
325 jnrlistF = jjnr[jidx+5];
326 jnrlistG = jjnr[jidx+6];
327 jnrlistH = jjnr[jidx+7];
328 /* Sign of each element will be negative for non-real atoms.
329 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
330 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
332 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
333 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
335 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
336 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
337 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
338 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
339 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
340 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
341 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
342 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
343 j_coord_offsetA = DIM*jnrA;
344 j_coord_offsetB = DIM*jnrB;
345 j_coord_offsetC = DIM*jnrC;
346 j_coord_offsetD = DIM*jnrD;
347 j_coord_offsetE = DIM*jnrE;
348 j_coord_offsetF = DIM*jnrF;
349 j_coord_offsetG = DIM*jnrG;
350 j_coord_offsetH = DIM*jnrH;
352 /* load j atom coordinates */
353 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
354 x+j_coord_offsetC,x+j_coord_offsetD,
355 x+j_coord_offsetE,x+j_coord_offsetF,
356 x+j_coord_offsetG,x+j_coord_offsetH,
359 /* Calculate displacement vector */
360 dx00 = _mm256_sub_ps(ix0,jx0);
361 dy00 = _mm256_sub_ps(iy0,jy0);
362 dz00 = _mm256_sub_ps(iz0,jz0);
364 /* Calculate squared distance and things based on it */
365 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
367 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
369 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
371 /* Load parameters for j particles */
372 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
373 charge+jnrC+0,charge+jnrD+0,
374 charge+jnrE+0,charge+jnrF+0,
375 charge+jnrG+0,charge+jnrH+0);
376 vdwjidx0A = 2*vdwtype[jnrA+0];
377 vdwjidx0B = 2*vdwtype[jnrB+0];
378 vdwjidx0C = 2*vdwtype[jnrC+0];
379 vdwjidx0D = 2*vdwtype[jnrD+0];
380 vdwjidx0E = 2*vdwtype[jnrE+0];
381 vdwjidx0F = 2*vdwtype[jnrF+0];
382 vdwjidx0G = 2*vdwtype[jnrG+0];
383 vdwjidx0H = 2*vdwtype[jnrH+0];
385 /**************************
386 * CALCULATE INTERACTIONS *
387 **************************/
389 r00 = _mm256_mul_ps(rsq00,rinv00);
/* Zero r for dummy lanes so that their table index computed below is 0 */
390 r00 = _mm256_andnot_ps(dummy_mask,r00);
392 /* Compute parameters for interactions between i and j atoms */
393 qq00 = _mm256_mul_ps(iq0,jq0);
394 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
395 vdwioffsetptr0+vdwjidx0B,
396 vdwioffsetptr0+vdwjidx0C,
397 vdwioffsetptr0+vdwjidx0D,
398 vdwioffsetptr0+vdwjidx0E,
399 vdwioffsetptr0+vdwjidx0F,
400 vdwioffsetptr0+vdwjidx0G,
401 vdwioffsetptr0+vdwjidx0H,
404 /* Calculate table index by multiplying r with table scale and truncate to integer */
405 rt = _mm256_mul_ps(r00,vftabscale);
406 vfitab = _mm256_cvttps_epi32(rt);
/* vfeps = rt - floor(rt): the fractional part of the scaled distance */
407 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
408 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
409 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
410 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
/* Index*8: stride of eight reals per table point (dispersion at +0, repulsion at +4) */
411 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
412 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
414 /* COULOMB ELECTROSTATICS */
415 velec = _mm256_mul_ps(qq00,rinv00);
416 felec = _mm256_mul_ps(velec,rinvsq00);
418 /* CUBIC SPLINE TABLE DISPERSION */
419 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
420 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
421 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
422 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
423 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
424 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
425 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
426 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
427 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
/* Spline: Fp = F + eps*(G + eps*H), VV = Y + eps*Fp, FF = Fp + eps*(G + 2*eps*H) */
428 Heps = _mm256_mul_ps(vfeps,H);
429 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
430 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
431 vvdw6 = _mm256_mul_ps(c6_00,VV);
432 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
433 fvdw6 = _mm256_mul_ps(c6_00,FF);
435 /* CUBIC SPLINE TABLE REPULSION */
436 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
437 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
438 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
439 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
440 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
441 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
442 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
443 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
444 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
445 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
446 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
447 Heps = _mm256_mul_ps(vfeps,H);
448 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
449 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
450 vvdw12 = _mm256_mul_ps(c12_00,VV);
451 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
452 fvdw12 = _mm256_mul_ps(c12_00,FF);
453 vvdw = _mm256_add_ps(vvdw12,vvdw6);
/* fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00; the XOR with signbit flips the sign */
454 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
456 /* Update potential sum for this i atom from the interaction with this j atom. */
457 velec = _mm256_andnot_ps(dummy_mask,velec);
458 velecsum = _mm256_add_ps(velecsum,velec);
459 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
460 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
462 fscal = _mm256_add_ps(felec,fvdw);
/* Clear the scalar force of dummy lanes so tx,ty,tz below are zero for them */
464 fscal = _mm256_andnot_ps(dummy_mask,fscal);
466 /* Calculate temporary vectorial force */
467 tx = _mm256_mul_ps(fscal,dx00);
468 ty = _mm256_mul_ps(fscal,dy00);
469 tz = _mm256_mul_ps(fscal,dz00);
471 /* Update vectorial force */
472 fix0 = _mm256_add_ps(fix0,tx);
473 fiy0 = _mm256_add_ps(fiy0,ty);
474 fiz0 = _mm256_add_ps(fiz0,tz);
/* Dummy lanes direct their (zeroed) force update to scratch instead of f */
476 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
477 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
478 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
479 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
480 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
481 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
482 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
483 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
484 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
486 /* Inner loop uses 63 flops */
489 /* End of innermost loop */
491 gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
492 f+i_coord_offset,fshift+i_shift_offset);
495 /* Update potential energies */
496 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
497 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
499 /* Increment number of inner iterations */
/* Adds the full length of this j list (j_index_end - j_index_start) */
500 inneriter += j_index_end - j_index_start;
502 /* Outer loop uses 9 flops */
505 /* Increment number of outer iterations */
508 /* Update outer/inner flops */
/* Uses 9 flops per outer iteration and 63 per inner entry, i.e. the masked-pass figure (the full-width pass is commented as 62) */
510 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_VF,outeriter*9 + inneriter*63);
513 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_256_single
514 * Electrostatics interaction: Coulomb
515 * VdW interaction: CubicSplineTable
516 * Geometry: Particle-Particle
517 * Calculate force/pot: Force
/*
 * Force-only variant of the Coulomb + cubic-spline-table Van der Waals
 * kernel for single-particle i/j pairs: it accumulates forces but, unlike
 * the VF kernel, computes no potential sums.  j neighbours are processed
 * eight at a time in 256-bit single-precision registers.
 *
 * NOTE(review): the numeric prefix on each line below looks like the line
 * number of the original generated file, and that sequence has gaps (no
 * braces, and no visible declarations for charge, vdwparam, vftab, vfitab
 * or scratch), so this listing is presumably an incomplete extract.
 * Confirm against the generator output before relying on it.
 */
520 nb_kernel_ElecCoul_VdwCSTab_GeomP1P1_F_avx_256_single
521 (t_nblist * gmx_restrict nlist,
522 rvec * gmx_restrict xx,
523 rvec * gmx_restrict ff,
524 t_forcerec * gmx_restrict fr,
525 t_mdatoms * gmx_restrict mdatoms,
526 nb_kernel_data_t * gmx_restrict kernel_data,
527 t_nrnb * gmx_restrict nrnb)
529 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
530 * just 0 for non-waters.
531 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
532 * jnr indices corresponding to data put in the eight positions in the SIMD register.
534 int i_shift_offset,i_coord_offset,outeriter,inneriter;
535 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
536 int jnrA,jnrB,jnrC,jnrD;
537 int jnrE,jnrF,jnrG,jnrH;
538 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
539 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
540 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
541 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
542 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
544 real *shiftvec,*fshift,*x,*f;
545 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
547 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
548 real * vdwioffsetptr0;
549 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
550 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
551 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
552 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
553 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
556 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
559 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
560 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
562 __m128i vfitab_lo,vfitab_hi;
563 __m128i ifour = _mm_set1_epi32(4);
564 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
566 __m256 dummy_mask,cutoff_mask;
567 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
568 __m256 one = _mm256_set1_ps(1.0);
569 __m256 two = _mm256_set1_ps(2.0);
575 jindex = nlist->jindex;
577 shiftidx = nlist->shift;
579 shiftvec = fr->shift_vec[0];
580 fshift = fr->fshift[0];
581 facel = _mm256_set1_ps(fr->epsfac);
582 charge = mdatoms->chargeA;
583 nvdwtype = fr->ntype;
585 vdwtype = mdatoms->typeA;
587 vftab = kernel_data->table_vdw->data;
588 vftabscale = _mm256_set1_ps(kernel_data->table_vdw->scale);
590 /* Avoid stupid compiler warnings */
591 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
/* NOTE(review): the body of the next loop is not visible in this listing;
 * presumably it initialises the scratch buffer used for dummy entries - confirm. */
604 for(iidx=0;iidx<4*DIM;iidx++)
609 /* Start outer loop over neighborlists */
610 for(iidx=0; iidx<nri; iidx++)
612 /* Load shift vector for this list */
613 i_shift_offset = DIM*shiftidx[iidx];
615 /* Load limits for loop over neighbors */
616 j_index_start = jindex[iidx];
617 j_index_end = jindex[iidx+1];
619 /* Get outer coordinate index */
621 i_coord_offset = DIM*inr;
623 /* Load i particle coords and add shift vector */
624 gmx_mm256_load_shift_and_1rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,&ix0,&iy0,&iz0);
626 fix0 = _mm256_setzero_ps();
627 fiy0 = _mm256_setzero_ps();
628 fiz0 = _mm256_setzero_ps();
630 /* Load parameters for i particles */
/* The i charge is pre-multiplied by facel (fr->epsfac), so qq00 = iq0*jq0 already carries that factor. */
631 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
/* Base pointer into vdwparam for this i type; j entries are addressed below as 2*type (two reals per pair, loaded as c6/c12). */
632 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
634 /* Start inner kernel loop */
/* Full-width pass: advances eight list entries per iteration and only tests
 * the last index of each group of eight.  Groups that fail the test appear to
 * be handled by the masked section further down, which treats negative
 * indices as dummy entries. */
635 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
638 /* Get j neighbor index, and coordinate index */
647 j_coord_offsetA = DIM*jnrA;
648 j_coord_offsetB = DIM*jnrB;
649 j_coord_offsetC = DIM*jnrC;
650 j_coord_offsetD = DIM*jnrD;
651 j_coord_offsetE = DIM*jnrE;
652 j_coord_offsetF = DIM*jnrF;
653 j_coord_offsetG = DIM*jnrG;
654 j_coord_offsetH = DIM*jnrH;
656 /* load j atom coordinates */
657 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
658 x+j_coord_offsetC,x+j_coord_offsetD,
659 x+j_coord_offsetE,x+j_coord_offsetF,
660 x+j_coord_offsetG,x+j_coord_offsetH,
663 /* Calculate displacement vector */
664 dx00 = _mm256_sub_ps(ix0,jx0);
665 dy00 = _mm256_sub_ps(iy0,jy0);
666 dz00 = _mm256_sub_ps(iz0,jz0);
668 /* Calculate squared distance and things based on it */
669 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
671 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
673 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
675 /* Load parameters for j particles */
676 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
677 charge+jnrC+0,charge+jnrD+0,
678 charge+jnrE+0,charge+jnrF+0,
679 charge+jnrG+0,charge+jnrH+0);
680 vdwjidx0A = 2*vdwtype[jnrA+0];
681 vdwjidx0B = 2*vdwtype[jnrB+0];
682 vdwjidx0C = 2*vdwtype[jnrC+0];
683 vdwjidx0D = 2*vdwtype[jnrD+0];
684 vdwjidx0E = 2*vdwtype[jnrE+0];
685 vdwjidx0F = 2*vdwtype[jnrF+0];
686 vdwjidx0G = 2*vdwtype[jnrG+0];
687 vdwjidx0H = 2*vdwtype[jnrH+0];
689 /**************************
690 * CALCULATE INTERACTIONS *
691 **************************/
/* r = rsq * (1/sqrt(rsq)) */
693 r00 = _mm256_mul_ps(rsq00,rinv00);
695 /* Compute parameters for interactions between i and j atoms */
696 qq00 = _mm256_mul_ps(iq0,jq0);
697 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
698 vdwioffsetptr0+vdwjidx0B,
699 vdwioffsetptr0+vdwjidx0C,
700 vdwioffsetptr0+vdwjidx0D,
701 vdwioffsetptr0+vdwjidx0E,
702 vdwioffsetptr0+vdwjidx0F,
703 vdwioffsetptr0+vdwjidx0G,
704 vdwioffsetptr0+vdwjidx0H,
707 /* Calculate table index by multiplying r with table scale and truncate to integer */
708 rt = _mm256_mul_ps(r00,vftabscale);
709 vfitab = _mm256_cvttps_epi32(rt);
/* vfeps = rt - floor(rt): the fractional part of the scaled distance */
710 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
711 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
712 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
713 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
/* Shift left by 3 = multiply the index by 8: each table point is addressed
 * with a stride of eight reals, four (Y,F,G,H) read for dispersion and four
 * more, at offset +4 (ifour), read for repulsion. */
714 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
715 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
717 /* COULOMB ELECTROSTATICS */
/* velec = qq/r is only an intermediate here; felec = velec/r^2 */
718 velec = _mm256_mul_ps(qq00,rinv00);
719 felec = _mm256_mul_ps(velec,rinvsq00);
721 /* CUBIC SPLINE TABLE DISPERSION */
722 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
723 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
724 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
725 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
726 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
727 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
728 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
729 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
730 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
/* Force-only spline evaluation with eps = vfeps (the VV potential term is not computed):
 *   Fp = F + eps*(G + eps*H)
 *   FF = Fp + eps*(G + 2*eps*H)
 */
731 Heps = _mm256_mul_ps(vfeps,H);
732 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
733 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
734 fvdw6 = _mm256_mul_ps(c6_00,FF);
736 /* CUBIC SPLINE TABLE REPULSION */
/* Same spline evaluation as above, on the four reals at offset +4, scaled by c12 */
737 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
738 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
739 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
740 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
741 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
742 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
743 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
744 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
745 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
746 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
747 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
748 Heps = _mm256_mul_ps(vfeps,H);
749 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
750 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
751 fvdw12 = _mm256_mul_ps(c12_00,FF);
/* fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00; the XOR with signbit flips the sign */
752 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
754 fscal = _mm256_add_ps(felec,fvdw);
756 /* Calculate temporary vectorial force */
757 tx = _mm256_mul_ps(fscal,dx00);
758 ty = _mm256_mul_ps(fscal,dy00);
759 tz = _mm256_mul_ps(fscal,dz00);
761 /* Update vectorial force */
762 fix0 = _mm256_add_ps(fix0,tx);
763 fiy0 = _mm256_add_ps(fiy0,ty);
764 fiz0 = _mm256_add_ps(fiz0,tz);
/* The same tx,ty,tz added to the i force above are handed to the decrement helper for the eight j atoms */
766 fjptrA = f+j_coord_offsetA;
767 fjptrB = f+j_coord_offsetB;
768 fjptrC = f+j_coord_offsetC;
769 fjptrD = f+j_coord_offsetD;
770 fjptrE = f+j_coord_offsetE;
771 fjptrF = f+j_coord_offsetF;
772 fjptrG = f+j_coord_offsetG;
773 fjptrH = f+j_coord_offsetH;
774 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
776 /* Inner loop uses 53 flops */
/* Masked pass for a group of eight that contains negative (dummy) indices.
 * Negative indices are replaced by 0 for the loads, and the resulting
 * forces of those lanes are cleared with dummy_mask.
 * NOTE(review): the statement guarding this section is not visible in this listing. */
782 /* Get j neighbor index, and coordinate index */
783 jnrlistA = jjnr[jidx];
784 jnrlistB = jjnr[jidx+1];
785 jnrlistC = jjnr[jidx+2];
786 jnrlistD = jjnr[jidx+3];
787 jnrlistE = jjnr[jidx+4];
788 jnrlistF = jjnr[jidx+5];
789 jnrlistG = jjnr[jidx+6];
790 jnrlistH = jjnr[jidx+7];
791 /* Sign of each element will be negative for non-real atoms.
792 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
793 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
795 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
796 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
798 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
799 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
800 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
801 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
802 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
803 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
804 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
805 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
806 j_coord_offsetA = DIM*jnrA;
807 j_coord_offsetB = DIM*jnrB;
808 j_coord_offsetC = DIM*jnrC;
809 j_coord_offsetD = DIM*jnrD;
810 j_coord_offsetE = DIM*jnrE;
811 j_coord_offsetF = DIM*jnrF;
812 j_coord_offsetG = DIM*jnrG;
813 j_coord_offsetH = DIM*jnrH;
815 /* load j atom coordinates */
816 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
817 x+j_coord_offsetC,x+j_coord_offsetD,
818 x+j_coord_offsetE,x+j_coord_offsetF,
819 x+j_coord_offsetG,x+j_coord_offsetH,
822 /* Calculate displacement vector */
823 dx00 = _mm256_sub_ps(ix0,jx0);
824 dy00 = _mm256_sub_ps(iy0,jy0);
825 dz00 = _mm256_sub_ps(iz0,jz0);
827 /* Calculate squared distance and things based on it */
828 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
830 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
832 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
834 /* Load parameters for j particles */
835 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
836 charge+jnrC+0,charge+jnrD+0,
837 charge+jnrE+0,charge+jnrF+0,
838 charge+jnrG+0,charge+jnrH+0);
839 vdwjidx0A = 2*vdwtype[jnrA+0];
840 vdwjidx0B = 2*vdwtype[jnrB+0];
841 vdwjidx0C = 2*vdwtype[jnrC+0];
842 vdwjidx0D = 2*vdwtype[jnrD+0];
843 vdwjidx0E = 2*vdwtype[jnrE+0];
844 vdwjidx0F = 2*vdwtype[jnrF+0];
845 vdwjidx0G = 2*vdwtype[jnrG+0];
846 vdwjidx0H = 2*vdwtype[jnrH+0];
848 /**************************
849 * CALCULATE INTERACTIONS *
850 **************************/
852 r00 = _mm256_mul_ps(rsq00,rinv00);
/* Zero r for dummy lanes so that their table index computed below is 0 */
853 r00 = _mm256_andnot_ps(dummy_mask,r00);
855 /* Compute parameters for interactions between i and j atoms */
856 qq00 = _mm256_mul_ps(iq0,jq0);
857 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
858 vdwioffsetptr0+vdwjidx0B,
859 vdwioffsetptr0+vdwjidx0C,
860 vdwioffsetptr0+vdwjidx0D,
861 vdwioffsetptr0+vdwjidx0E,
862 vdwioffsetptr0+vdwjidx0F,
863 vdwioffsetptr0+vdwjidx0G,
864 vdwioffsetptr0+vdwjidx0H,
867 /* Calculate table index by multiplying r with table scale and truncate to integer */
868 rt = _mm256_mul_ps(r00,vftabscale);
869 vfitab = _mm256_cvttps_epi32(rt);
/* vfeps = rt - floor(rt): the fractional part of the scaled distance */
870 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
871 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
872 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
873 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
/* Index*8: stride of eight reals per table point (dispersion at +0, repulsion at +4) */
874 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
875 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
877 /* COULOMB ELECTROSTATICS */
878 velec = _mm256_mul_ps(qq00,rinv00);
879 felec = _mm256_mul_ps(velec,rinvsq00);
881 /* CUBIC SPLINE TABLE DISPERSION */
882 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
883 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
884 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
885 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
886 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
887 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
888 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
889 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
890 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
/* Spline (force only): Fp = F + eps*(G + eps*H), FF = Fp + eps*(G + 2*eps*H) */
891 Heps = _mm256_mul_ps(vfeps,H);
892 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
893 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
894 fvdw6 = _mm256_mul_ps(c6_00,FF);
896 /* CUBIC SPLINE TABLE REPULSION */
897 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
898 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
899 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
900 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
901 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
902 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
903 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
904 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
905 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
906 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
907 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
908 Heps = _mm256_mul_ps(vfeps,H);
909 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
910 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
911 fvdw12 = _mm256_mul_ps(c12_00,FF);
/* fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00; the XOR with signbit flips the sign */
912 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
914 fscal = _mm256_add_ps(felec,fvdw);
/* Clear the scalar force of dummy lanes so tx,ty,tz below are zero for them */
916 fscal = _mm256_andnot_ps(dummy_mask,fscal);
918 /* Calculate temporary vectorial force */
919 tx = _mm256_mul_ps(fscal,dx00);
920 ty = _mm256_mul_ps(fscal,dy00);
921 tz = _mm256_mul_ps(fscal,dz00);
923 /* Update vectorial force */
924 fix0 = _mm256_add_ps(fix0,tx);
925 fiy0 = _mm256_add_ps(fiy0,ty);
926 fiz0 = _mm256_add_ps(fiz0,tz);
/* Dummy lanes direct their (zeroed) force update to scratch instead of f */
928 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
929 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
930 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
931 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
932 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
933 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
934 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
935 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
936 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,tx,ty,tz);
938 /* Inner loop uses 54 flops */
941 /* End of innermost loop */
943 gmx_mm256_update_iforce_1atom_swizzle_ps(fix0,fiy0,fiz0,
944 f+i_coord_offset,fshift+i_shift_offset);
946 /* Increment number of inner iterations */
/* Adds the full length of this j list (j_index_end - j_index_start) */
947 inneriter += j_index_end - j_index_start;
949 /* Outer loop uses 7 flops */
952 /* Increment number of outer iterations */
955 /* Update outer/inner flops */
/* Uses 7 flops per outer iteration and 54 per inner entry, i.e. the masked-pass figure (the full-width pass is commented as 53) */
957 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_F,outeriter*7 + inneriter*54);