2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_256_single
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 real * vdwioffsetptr3;
79 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
80 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
81 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
82 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
83 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
84 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
85 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
86 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
87 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
88 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
89 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
90 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
91 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
92 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
93 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
94 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
95 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
96 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
97 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
98 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
101 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
104 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
105 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
107 __m128i vfitab_lo,vfitab_hi;
108 __m128i ifour = _mm_set1_epi32(4);
109 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
111 __m256 dummy_mask,cutoff_mask;
112 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
113 __m256 one = _mm256_set1_ps(1.0);
114 __m256 two = _mm256_set1_ps(2.0);
120 jindex = nlist->jindex;
122 shiftidx = nlist->shift;
124 shiftvec = fr->shift_vec[0];
125 fshift = fr->fshift[0];
126 facel = _mm256_set1_ps(fr->epsfac);
127 charge = mdatoms->chargeA;
128 nvdwtype = fr->ntype;
130 vdwtype = mdatoms->typeA;
132 vftab = kernel_data->table_vdw->data;
133 vftabscale = _mm256_set1_ps(kernel_data->table_vdw->scale);
135 /* Setup water-specific parameters */
136 inr = nlist->iinr[0];
137 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
138 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
139 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
140 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
142 jq1 = _mm256_set1_ps(charge[inr+1]);
143 jq2 = _mm256_set1_ps(charge[inr+2]);
144 jq3 = _mm256_set1_ps(charge[inr+3]);
145 vdwjidx0A = 2*vdwtype[inr+0];
146 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
147 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
148 qq11 = _mm256_mul_ps(iq1,jq1);
149 qq12 = _mm256_mul_ps(iq1,jq2);
150 qq13 = _mm256_mul_ps(iq1,jq3);
151 qq21 = _mm256_mul_ps(iq2,jq1);
152 qq22 = _mm256_mul_ps(iq2,jq2);
153 qq23 = _mm256_mul_ps(iq2,jq3);
154 qq31 = _mm256_mul_ps(iq3,jq1);
155 qq32 = _mm256_mul_ps(iq3,jq2);
156 qq33 = _mm256_mul_ps(iq3,jq3);
158 /* Avoid stupid compiler warnings */
159 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
172 for(iidx=0;iidx<4*DIM;iidx++)
177 /* Start outer loop over neighborlists */
178 for(iidx=0; iidx<nri; iidx++)
180 /* Load shift vector for this list */
181 i_shift_offset = DIM*shiftidx[iidx];
183 /* Load limits for loop over neighbors */
184 j_index_start = jindex[iidx];
185 j_index_end = jindex[iidx+1];
187 /* Get outer coordinate index */
189 i_coord_offset = DIM*inr;
191 /* Load i particle coords and add shift vector */
192 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
193 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
195 fix0 = _mm256_setzero_ps();
196 fiy0 = _mm256_setzero_ps();
197 fiz0 = _mm256_setzero_ps();
198 fix1 = _mm256_setzero_ps();
199 fiy1 = _mm256_setzero_ps();
200 fiz1 = _mm256_setzero_ps();
201 fix2 = _mm256_setzero_ps();
202 fiy2 = _mm256_setzero_ps();
203 fiz2 = _mm256_setzero_ps();
204 fix3 = _mm256_setzero_ps();
205 fiy3 = _mm256_setzero_ps();
206 fiz3 = _mm256_setzero_ps();
208 /* Reset potential sums */
209 velecsum = _mm256_setzero_ps();
210 vvdwsum = _mm256_setzero_ps();
212 /* Start inner kernel loop */
213 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
216 /* Get j neighbor index, and coordinate index */
225 j_coord_offsetA = DIM*jnrA;
226 j_coord_offsetB = DIM*jnrB;
227 j_coord_offsetC = DIM*jnrC;
228 j_coord_offsetD = DIM*jnrD;
229 j_coord_offsetE = DIM*jnrE;
230 j_coord_offsetF = DIM*jnrF;
231 j_coord_offsetG = DIM*jnrG;
232 j_coord_offsetH = DIM*jnrH;
234 /* load j atom coordinates */
235 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236 x+j_coord_offsetC,x+j_coord_offsetD,
237 x+j_coord_offsetE,x+j_coord_offsetF,
238 x+j_coord_offsetG,x+j_coord_offsetH,
239 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
240 &jy2,&jz2,&jx3,&jy3,&jz3);
242 /* Calculate displacement vector */
243 dx00 = _mm256_sub_ps(ix0,jx0);
244 dy00 = _mm256_sub_ps(iy0,jy0);
245 dz00 = _mm256_sub_ps(iz0,jz0);
246 dx11 = _mm256_sub_ps(ix1,jx1);
247 dy11 = _mm256_sub_ps(iy1,jy1);
248 dz11 = _mm256_sub_ps(iz1,jz1);
249 dx12 = _mm256_sub_ps(ix1,jx2);
250 dy12 = _mm256_sub_ps(iy1,jy2);
251 dz12 = _mm256_sub_ps(iz1,jz2);
252 dx13 = _mm256_sub_ps(ix1,jx3);
253 dy13 = _mm256_sub_ps(iy1,jy3);
254 dz13 = _mm256_sub_ps(iz1,jz3);
255 dx21 = _mm256_sub_ps(ix2,jx1);
256 dy21 = _mm256_sub_ps(iy2,jy1);
257 dz21 = _mm256_sub_ps(iz2,jz1);
258 dx22 = _mm256_sub_ps(ix2,jx2);
259 dy22 = _mm256_sub_ps(iy2,jy2);
260 dz22 = _mm256_sub_ps(iz2,jz2);
261 dx23 = _mm256_sub_ps(ix2,jx3);
262 dy23 = _mm256_sub_ps(iy2,jy3);
263 dz23 = _mm256_sub_ps(iz2,jz3);
264 dx31 = _mm256_sub_ps(ix3,jx1);
265 dy31 = _mm256_sub_ps(iy3,jy1);
266 dz31 = _mm256_sub_ps(iz3,jz1);
267 dx32 = _mm256_sub_ps(ix3,jx2);
268 dy32 = _mm256_sub_ps(iy3,jy2);
269 dz32 = _mm256_sub_ps(iz3,jz2);
270 dx33 = _mm256_sub_ps(ix3,jx3);
271 dy33 = _mm256_sub_ps(iy3,jy3);
272 dz33 = _mm256_sub_ps(iz3,jz3);
274 /* Calculate squared distance and things based on it */
275 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
276 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
277 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
278 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
279 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
280 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
281 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
282 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
283 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
284 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
286 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
287 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
288 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
289 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
290 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
291 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
292 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
293 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
294 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
295 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
297 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
298 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
299 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
300 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
301 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
302 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
303 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
304 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
305 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
307 fjx0 = _mm256_setzero_ps();
308 fjy0 = _mm256_setzero_ps();
309 fjz0 = _mm256_setzero_ps();
310 fjx1 = _mm256_setzero_ps();
311 fjy1 = _mm256_setzero_ps();
312 fjz1 = _mm256_setzero_ps();
313 fjx2 = _mm256_setzero_ps();
314 fjy2 = _mm256_setzero_ps();
315 fjz2 = _mm256_setzero_ps();
316 fjx3 = _mm256_setzero_ps();
317 fjy3 = _mm256_setzero_ps();
318 fjz3 = _mm256_setzero_ps();
320 /**************************
321 * CALCULATE INTERACTIONS *
322 **************************/
324 r00 = _mm256_mul_ps(rsq00,rinv00);
326 /* Calculate table index by multiplying r with table scale and truncate to integer */
327 rt = _mm256_mul_ps(r00,vftabscale);
328 vfitab = _mm256_cvttps_epi32(rt);
329 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
330 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
331 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
332 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
333 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
334 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
336 /* CUBIC SPLINE TABLE DISPERSION */
337 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
338 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
339 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
340 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
341 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
342 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
343 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
344 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
345 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
346 Heps = _mm256_mul_ps(vfeps,H);
347 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
348 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
349 vvdw6 = _mm256_mul_ps(c6_00,VV);
350 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
351 fvdw6 = _mm256_mul_ps(c6_00,FF);
353 /* CUBIC SPLINE TABLE REPULSION */
354 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
355 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
356 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
357 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
358 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
359 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
360 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
361 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
362 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
363 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
364 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
365 Heps = _mm256_mul_ps(vfeps,H);
366 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
367 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
368 vvdw12 = _mm256_mul_ps(c12_00,VV);
369 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
370 fvdw12 = _mm256_mul_ps(c12_00,FF);
371 vvdw = _mm256_add_ps(vvdw12,vvdw6);
372 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
374 /* Update potential sum for this i atom from the interaction with this j atom. */
375 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
379 /* Calculate temporary vectorial force */
380 tx = _mm256_mul_ps(fscal,dx00);
381 ty = _mm256_mul_ps(fscal,dy00);
382 tz = _mm256_mul_ps(fscal,dz00);
384 /* Update vectorial force */
385 fix0 = _mm256_add_ps(fix0,tx);
386 fiy0 = _mm256_add_ps(fiy0,ty);
387 fiz0 = _mm256_add_ps(fiz0,tz);
389 fjx0 = _mm256_add_ps(fjx0,tx);
390 fjy0 = _mm256_add_ps(fjy0,ty);
391 fjz0 = _mm256_add_ps(fjz0,tz);
393 /**************************
394 * CALCULATE INTERACTIONS *
395 **************************/
397 /* COULOMB ELECTROSTATICS */
398 velec = _mm256_mul_ps(qq11,rinv11);
399 felec = _mm256_mul_ps(velec,rinvsq11);
401 /* Update potential sum for this i atom from the interaction with this j atom. */
402 velecsum = _mm256_add_ps(velecsum,velec);
406 /* Calculate temporary vectorial force */
407 tx = _mm256_mul_ps(fscal,dx11);
408 ty = _mm256_mul_ps(fscal,dy11);
409 tz = _mm256_mul_ps(fscal,dz11);
411 /* Update vectorial force */
412 fix1 = _mm256_add_ps(fix1,tx);
413 fiy1 = _mm256_add_ps(fiy1,ty);
414 fiz1 = _mm256_add_ps(fiz1,tz);
416 fjx1 = _mm256_add_ps(fjx1,tx);
417 fjy1 = _mm256_add_ps(fjy1,ty);
418 fjz1 = _mm256_add_ps(fjz1,tz);
420 /**************************
421 * CALCULATE INTERACTIONS *
422 **************************/
424 /* COULOMB ELECTROSTATICS */
425 velec = _mm256_mul_ps(qq12,rinv12);
426 felec = _mm256_mul_ps(velec,rinvsq12);
428 /* Update potential sum for this i atom from the interaction with this j atom. */
429 velecsum = _mm256_add_ps(velecsum,velec);
433 /* Calculate temporary vectorial force */
434 tx = _mm256_mul_ps(fscal,dx12);
435 ty = _mm256_mul_ps(fscal,dy12);
436 tz = _mm256_mul_ps(fscal,dz12);
438 /* Update vectorial force */
439 fix1 = _mm256_add_ps(fix1,tx);
440 fiy1 = _mm256_add_ps(fiy1,ty);
441 fiz1 = _mm256_add_ps(fiz1,tz);
443 fjx2 = _mm256_add_ps(fjx2,tx);
444 fjy2 = _mm256_add_ps(fjy2,ty);
445 fjz2 = _mm256_add_ps(fjz2,tz);
447 /**************************
448 * CALCULATE INTERACTIONS *
449 **************************/
451 /* COULOMB ELECTROSTATICS */
452 velec = _mm256_mul_ps(qq13,rinv13);
453 felec = _mm256_mul_ps(velec,rinvsq13);
455 /* Update potential sum for this i atom from the interaction with this j atom. */
456 velecsum = _mm256_add_ps(velecsum,velec);
460 /* Calculate temporary vectorial force */
461 tx = _mm256_mul_ps(fscal,dx13);
462 ty = _mm256_mul_ps(fscal,dy13);
463 tz = _mm256_mul_ps(fscal,dz13);
465 /* Update vectorial force */
466 fix1 = _mm256_add_ps(fix1,tx);
467 fiy1 = _mm256_add_ps(fiy1,ty);
468 fiz1 = _mm256_add_ps(fiz1,tz);
470 fjx3 = _mm256_add_ps(fjx3,tx);
471 fjy3 = _mm256_add_ps(fjy3,ty);
472 fjz3 = _mm256_add_ps(fjz3,tz);
474 /**************************
475 * CALCULATE INTERACTIONS *
476 **************************/
478 /* COULOMB ELECTROSTATICS */
479 velec = _mm256_mul_ps(qq21,rinv21);
480 felec = _mm256_mul_ps(velec,rinvsq21);
482 /* Update potential sum for this i atom from the interaction with this j atom. */
483 velecsum = _mm256_add_ps(velecsum,velec);
487 /* Calculate temporary vectorial force */
488 tx = _mm256_mul_ps(fscal,dx21);
489 ty = _mm256_mul_ps(fscal,dy21);
490 tz = _mm256_mul_ps(fscal,dz21);
492 /* Update vectorial force */
493 fix2 = _mm256_add_ps(fix2,tx);
494 fiy2 = _mm256_add_ps(fiy2,ty);
495 fiz2 = _mm256_add_ps(fiz2,tz);
497 fjx1 = _mm256_add_ps(fjx1,tx);
498 fjy1 = _mm256_add_ps(fjy1,ty);
499 fjz1 = _mm256_add_ps(fjz1,tz);
501 /**************************
502 * CALCULATE INTERACTIONS *
503 **************************/
505 /* COULOMB ELECTROSTATICS */
506 velec = _mm256_mul_ps(qq22,rinv22);
507 felec = _mm256_mul_ps(velec,rinvsq22);
509 /* Update potential sum for this i atom from the interaction with this j atom. */
510 velecsum = _mm256_add_ps(velecsum,velec);
514 /* Calculate temporary vectorial force */
515 tx = _mm256_mul_ps(fscal,dx22);
516 ty = _mm256_mul_ps(fscal,dy22);
517 tz = _mm256_mul_ps(fscal,dz22);
519 /* Update vectorial force */
520 fix2 = _mm256_add_ps(fix2,tx);
521 fiy2 = _mm256_add_ps(fiy2,ty);
522 fiz2 = _mm256_add_ps(fiz2,tz);
524 fjx2 = _mm256_add_ps(fjx2,tx);
525 fjy2 = _mm256_add_ps(fjy2,ty);
526 fjz2 = _mm256_add_ps(fjz2,tz);
528 /**************************
529 * CALCULATE INTERACTIONS *
530 **************************/
532 /* COULOMB ELECTROSTATICS */
533 velec = _mm256_mul_ps(qq23,rinv23);
534 felec = _mm256_mul_ps(velec,rinvsq23);
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velecsum = _mm256_add_ps(velecsum,velec);
541 /* Calculate temporary vectorial force */
542 tx = _mm256_mul_ps(fscal,dx23);
543 ty = _mm256_mul_ps(fscal,dy23);
544 tz = _mm256_mul_ps(fscal,dz23);
546 /* Update vectorial force */
547 fix2 = _mm256_add_ps(fix2,tx);
548 fiy2 = _mm256_add_ps(fiy2,ty);
549 fiz2 = _mm256_add_ps(fiz2,tz);
551 fjx3 = _mm256_add_ps(fjx3,tx);
552 fjy3 = _mm256_add_ps(fjy3,ty);
553 fjz3 = _mm256_add_ps(fjz3,tz);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 /* COULOMB ELECTROSTATICS */
560 velec = _mm256_mul_ps(qq31,rinv31);
561 felec = _mm256_mul_ps(velec,rinvsq31);
563 /* Update potential sum for this i atom from the interaction with this j atom. */
564 velecsum = _mm256_add_ps(velecsum,velec);
568 /* Calculate temporary vectorial force */
569 tx = _mm256_mul_ps(fscal,dx31);
570 ty = _mm256_mul_ps(fscal,dy31);
571 tz = _mm256_mul_ps(fscal,dz31);
573 /* Update vectorial force */
574 fix3 = _mm256_add_ps(fix3,tx);
575 fiy3 = _mm256_add_ps(fiy3,ty);
576 fiz3 = _mm256_add_ps(fiz3,tz);
578 fjx1 = _mm256_add_ps(fjx1,tx);
579 fjy1 = _mm256_add_ps(fjy1,ty);
580 fjz1 = _mm256_add_ps(fjz1,tz);
582 /**************************
583 * CALCULATE INTERACTIONS *
584 **************************/
586 /* COULOMB ELECTROSTATICS */
587 velec = _mm256_mul_ps(qq32,rinv32);
588 felec = _mm256_mul_ps(velec,rinvsq32);
590 /* Update potential sum for this i atom from the interaction with this j atom. */
591 velecsum = _mm256_add_ps(velecsum,velec);
595 /* Calculate temporary vectorial force */
596 tx = _mm256_mul_ps(fscal,dx32);
597 ty = _mm256_mul_ps(fscal,dy32);
598 tz = _mm256_mul_ps(fscal,dz32);
600 /* Update vectorial force */
601 fix3 = _mm256_add_ps(fix3,tx);
602 fiy3 = _mm256_add_ps(fiy3,ty);
603 fiz3 = _mm256_add_ps(fiz3,tz);
605 fjx2 = _mm256_add_ps(fjx2,tx);
606 fjy2 = _mm256_add_ps(fjy2,ty);
607 fjz2 = _mm256_add_ps(fjz2,tz);
609 /**************************
610 * CALCULATE INTERACTIONS *
611 **************************/
613 /* COULOMB ELECTROSTATICS */
614 velec = _mm256_mul_ps(qq33,rinv33);
615 felec = _mm256_mul_ps(velec,rinvsq33);
617 /* Update potential sum for this i atom from the interaction with this j atom. */
618 velecsum = _mm256_add_ps(velecsum,velec);
622 /* Calculate temporary vectorial force */
623 tx = _mm256_mul_ps(fscal,dx33);
624 ty = _mm256_mul_ps(fscal,dy33);
625 tz = _mm256_mul_ps(fscal,dz33);
627 /* Update vectorial force */
628 fix3 = _mm256_add_ps(fix3,tx);
629 fiy3 = _mm256_add_ps(fiy3,ty);
630 fiz3 = _mm256_add_ps(fiz3,tz);
632 fjx3 = _mm256_add_ps(fjx3,tx);
633 fjy3 = _mm256_add_ps(fjy3,ty);
634 fjz3 = _mm256_add_ps(fjz3,tz);
636 fjptrA = f+j_coord_offsetA;
637 fjptrB = f+j_coord_offsetB;
638 fjptrC = f+j_coord_offsetC;
639 fjptrD = f+j_coord_offsetD;
640 fjptrE = f+j_coord_offsetE;
641 fjptrF = f+j_coord_offsetF;
642 fjptrG = f+j_coord_offsetG;
643 fjptrH = f+j_coord_offsetH;
645 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
646 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
647 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
649 /* Inner loop uses 302 flops */
655 /* Get j neighbor index, and coordinate index */
656 jnrlistA = jjnr[jidx];
657 jnrlistB = jjnr[jidx+1];
658 jnrlistC = jjnr[jidx+2];
659 jnrlistD = jjnr[jidx+3];
660 jnrlistE = jjnr[jidx+4];
661 jnrlistF = jjnr[jidx+5];
662 jnrlistG = jjnr[jidx+6];
663 jnrlistH = jjnr[jidx+7];
664 /* Sign of each element will be negative for non-real atoms.
665 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
666 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
668 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
669 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
671 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
672 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
673 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
674 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
675 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
676 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
677 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
678 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
679 j_coord_offsetA = DIM*jnrA;
680 j_coord_offsetB = DIM*jnrB;
681 j_coord_offsetC = DIM*jnrC;
682 j_coord_offsetD = DIM*jnrD;
683 j_coord_offsetE = DIM*jnrE;
684 j_coord_offsetF = DIM*jnrF;
685 j_coord_offsetG = DIM*jnrG;
686 j_coord_offsetH = DIM*jnrH;
688 /* load j atom coordinates */
689 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
690 x+j_coord_offsetC,x+j_coord_offsetD,
691 x+j_coord_offsetE,x+j_coord_offsetF,
692 x+j_coord_offsetG,x+j_coord_offsetH,
693 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
694 &jy2,&jz2,&jx3,&jy3,&jz3);
696 /* Calculate displacement vector */
697 dx00 = _mm256_sub_ps(ix0,jx0);
698 dy00 = _mm256_sub_ps(iy0,jy0);
699 dz00 = _mm256_sub_ps(iz0,jz0);
700 dx11 = _mm256_sub_ps(ix1,jx1);
701 dy11 = _mm256_sub_ps(iy1,jy1);
702 dz11 = _mm256_sub_ps(iz1,jz1);
703 dx12 = _mm256_sub_ps(ix1,jx2);
704 dy12 = _mm256_sub_ps(iy1,jy2);
705 dz12 = _mm256_sub_ps(iz1,jz2);
706 dx13 = _mm256_sub_ps(ix1,jx3);
707 dy13 = _mm256_sub_ps(iy1,jy3);
708 dz13 = _mm256_sub_ps(iz1,jz3);
709 dx21 = _mm256_sub_ps(ix2,jx1);
710 dy21 = _mm256_sub_ps(iy2,jy1);
711 dz21 = _mm256_sub_ps(iz2,jz1);
712 dx22 = _mm256_sub_ps(ix2,jx2);
713 dy22 = _mm256_sub_ps(iy2,jy2);
714 dz22 = _mm256_sub_ps(iz2,jz2);
715 dx23 = _mm256_sub_ps(ix2,jx3);
716 dy23 = _mm256_sub_ps(iy2,jy3);
717 dz23 = _mm256_sub_ps(iz2,jz3);
718 dx31 = _mm256_sub_ps(ix3,jx1);
719 dy31 = _mm256_sub_ps(iy3,jy1);
720 dz31 = _mm256_sub_ps(iz3,jz1);
721 dx32 = _mm256_sub_ps(ix3,jx2);
722 dy32 = _mm256_sub_ps(iy3,jy2);
723 dz32 = _mm256_sub_ps(iz3,jz2);
724 dx33 = _mm256_sub_ps(ix3,jx3);
725 dy33 = _mm256_sub_ps(iy3,jy3);
726 dz33 = _mm256_sub_ps(iz3,jz3);
728 /* Calculate squared distance and things based on it */
729 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
730 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
731 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
732 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
733 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
734 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
735 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
736 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
737 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
738 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
740 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
741 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
742 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
743 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
744 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
745 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
746 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
747 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
748 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
749 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
751 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
752 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
753 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
754 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
755 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
756 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
757 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
758 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
759 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
761 fjx0 = _mm256_setzero_ps();
762 fjy0 = _mm256_setzero_ps();
763 fjz0 = _mm256_setzero_ps();
764 fjx1 = _mm256_setzero_ps();
765 fjy1 = _mm256_setzero_ps();
766 fjz1 = _mm256_setzero_ps();
767 fjx2 = _mm256_setzero_ps();
768 fjy2 = _mm256_setzero_ps();
769 fjz2 = _mm256_setzero_ps();
770 fjx3 = _mm256_setzero_ps();
771 fjy3 = _mm256_setzero_ps();
772 fjz3 = _mm256_setzero_ps();
774 /**************************
775 * CALCULATE INTERACTIONS *
776 **************************/
778 r00 = _mm256_mul_ps(rsq00,rinv00);
779 r00 = _mm256_andnot_ps(dummy_mask,r00);
781 /* Calculate table index by multiplying r with table scale and truncate to integer */
782 rt = _mm256_mul_ps(r00,vftabscale);
783 vfitab = _mm256_cvttps_epi32(rt);
784 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
785 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
786 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
787 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
788 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
789 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
791 /* CUBIC SPLINE TABLE DISPERSION */
792 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
793 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
794 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
795 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
796 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
797 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
798 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
799 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
800 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
801 Heps = _mm256_mul_ps(vfeps,H);
802 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
803 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
804 vvdw6 = _mm256_mul_ps(c6_00,VV);
805 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
806 fvdw6 = _mm256_mul_ps(c6_00,FF);
808 /* CUBIC SPLINE TABLE REPULSION */
809 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
810 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
811 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
812 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
813 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
814 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
815 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
816 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
817 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
818 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
819 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
820 Heps = _mm256_mul_ps(vfeps,H);
821 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
822 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
823 vvdw12 = _mm256_mul_ps(c12_00,VV);
824 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
825 fvdw12 = _mm256_mul_ps(c12_00,FF);
826 vvdw = _mm256_add_ps(vvdw12,vvdw6);
827 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
829 /* Update potential sum for this i atom from the interaction with this j atom. */
830 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
831 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
835 fscal = _mm256_andnot_ps(dummy_mask,fscal);
837 /* Calculate temporary vectorial force */
838 tx = _mm256_mul_ps(fscal,dx00);
839 ty = _mm256_mul_ps(fscal,dy00);
840 tz = _mm256_mul_ps(fscal,dz00);
842 /* Update vectorial force */
843 fix0 = _mm256_add_ps(fix0,tx);
844 fiy0 = _mm256_add_ps(fiy0,ty);
845 fiz0 = _mm256_add_ps(fiz0,tz);
847 fjx0 = _mm256_add_ps(fjx0,tx);
848 fjy0 = _mm256_add_ps(fjy0,ty);
849 fjz0 = _mm256_add_ps(fjz0,tz);
851 /**************************
852 * CALCULATE INTERACTIONS *
853 **************************/
855 /* COULOMB ELECTROSTATICS */
856 velec = _mm256_mul_ps(qq11,rinv11);
857 felec = _mm256_mul_ps(velec,rinvsq11);
859 /* Update potential sum for this i atom from the interaction with this j atom. */
860 velec = _mm256_andnot_ps(dummy_mask,velec);
861 velecsum = _mm256_add_ps(velecsum,velec);
865 fscal = _mm256_andnot_ps(dummy_mask,fscal);
867 /* Calculate temporary vectorial force */
868 tx = _mm256_mul_ps(fscal,dx11);
869 ty = _mm256_mul_ps(fscal,dy11);
870 tz = _mm256_mul_ps(fscal,dz11);
872 /* Update vectorial force */
873 fix1 = _mm256_add_ps(fix1,tx);
874 fiy1 = _mm256_add_ps(fiy1,ty);
875 fiz1 = _mm256_add_ps(fiz1,tz);
877 fjx1 = _mm256_add_ps(fjx1,tx);
878 fjy1 = _mm256_add_ps(fjy1,ty);
879 fjz1 = _mm256_add_ps(fjz1,tz);
881 /**************************
882 * CALCULATE INTERACTIONS *
883 **************************/
885 /* COULOMB ELECTROSTATICS */
886 velec = _mm256_mul_ps(qq12,rinv12);
887 felec = _mm256_mul_ps(velec,rinvsq12);
889 /* Update potential sum for this i atom from the interaction with this j atom. */
890 velec = _mm256_andnot_ps(dummy_mask,velec);
891 velecsum = _mm256_add_ps(velecsum,velec);
895 fscal = _mm256_andnot_ps(dummy_mask,fscal);
897 /* Calculate temporary vectorial force */
898 tx = _mm256_mul_ps(fscal,dx12);
899 ty = _mm256_mul_ps(fscal,dy12);
900 tz = _mm256_mul_ps(fscal,dz12);
902 /* Update vectorial force */
903 fix1 = _mm256_add_ps(fix1,tx);
904 fiy1 = _mm256_add_ps(fiy1,ty);
905 fiz1 = _mm256_add_ps(fiz1,tz);
907 fjx2 = _mm256_add_ps(fjx2,tx);
908 fjy2 = _mm256_add_ps(fjy2,ty);
909 fjz2 = _mm256_add_ps(fjz2,tz);
911 /**************************
912 * CALCULATE INTERACTIONS *
913 **************************/
915 /* COULOMB ELECTROSTATICS */
916 velec = _mm256_mul_ps(qq13,rinv13);
917 felec = _mm256_mul_ps(velec,rinvsq13);
919 /* Update potential sum for this i atom from the interaction with this j atom. */
920 velec = _mm256_andnot_ps(dummy_mask,velec);
921 velecsum = _mm256_add_ps(velecsum,velec);
925 fscal = _mm256_andnot_ps(dummy_mask,fscal);
927 /* Calculate temporary vectorial force */
928 tx = _mm256_mul_ps(fscal,dx13);
929 ty = _mm256_mul_ps(fscal,dy13);
930 tz = _mm256_mul_ps(fscal,dz13);
932 /* Update vectorial force */
933 fix1 = _mm256_add_ps(fix1,tx);
934 fiy1 = _mm256_add_ps(fiy1,ty);
935 fiz1 = _mm256_add_ps(fiz1,tz);
937 fjx3 = _mm256_add_ps(fjx3,tx);
938 fjy3 = _mm256_add_ps(fjy3,ty);
939 fjz3 = _mm256_add_ps(fjz3,tz);
941 /**************************
942 * CALCULATE INTERACTIONS *
943 **************************/
945 /* COULOMB ELECTROSTATICS */
946 velec = _mm256_mul_ps(qq21,rinv21);
947 felec = _mm256_mul_ps(velec,rinvsq21);
949 /* Update potential sum for this i atom from the interaction with this j atom. */
950 velec = _mm256_andnot_ps(dummy_mask,velec);
951 velecsum = _mm256_add_ps(velecsum,velec);
955 fscal = _mm256_andnot_ps(dummy_mask,fscal);
957 /* Calculate temporary vectorial force */
958 tx = _mm256_mul_ps(fscal,dx21);
959 ty = _mm256_mul_ps(fscal,dy21);
960 tz = _mm256_mul_ps(fscal,dz21);
962 /* Update vectorial force */
963 fix2 = _mm256_add_ps(fix2,tx);
964 fiy2 = _mm256_add_ps(fiy2,ty);
965 fiz2 = _mm256_add_ps(fiz2,tz);
967 fjx1 = _mm256_add_ps(fjx1,tx);
968 fjy1 = _mm256_add_ps(fjy1,ty);
969 fjz1 = _mm256_add_ps(fjz1,tz);
971 /**************************
972 * CALCULATE INTERACTIONS *
973 **************************/
975 /* COULOMB ELECTROSTATICS */
976 velec = _mm256_mul_ps(qq22,rinv22);
977 felec = _mm256_mul_ps(velec,rinvsq22);
979 /* Update potential sum for this i atom from the interaction with this j atom. */
980 velec = _mm256_andnot_ps(dummy_mask,velec);
981 velecsum = _mm256_add_ps(velecsum,velec);
985 fscal = _mm256_andnot_ps(dummy_mask,fscal);
987 /* Calculate temporary vectorial force */
988 tx = _mm256_mul_ps(fscal,dx22);
989 ty = _mm256_mul_ps(fscal,dy22);
990 tz = _mm256_mul_ps(fscal,dz22);
992 /* Update vectorial force */
993 fix2 = _mm256_add_ps(fix2,tx);
994 fiy2 = _mm256_add_ps(fiy2,ty);
995 fiz2 = _mm256_add_ps(fiz2,tz);
997 fjx2 = _mm256_add_ps(fjx2,tx);
998 fjy2 = _mm256_add_ps(fjy2,ty);
999 fjz2 = _mm256_add_ps(fjz2,tz);
1001 /**************************
1002 * CALCULATE INTERACTIONS *
1003 **************************/
1005 /* COULOMB ELECTROSTATICS */
1006 velec = _mm256_mul_ps(qq23,rinv23);
1007 felec = _mm256_mul_ps(velec,rinvsq23);
1009 /* Update potential sum for this i atom from the interaction with this j atom. */
1010 velec = _mm256_andnot_ps(dummy_mask,velec);
1011 velecsum = _mm256_add_ps(velecsum,velec);
1015 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1017 /* Calculate temporary vectorial force */
1018 tx = _mm256_mul_ps(fscal,dx23);
1019 ty = _mm256_mul_ps(fscal,dy23);
1020 tz = _mm256_mul_ps(fscal,dz23);
1022 /* Update vectorial force */
1023 fix2 = _mm256_add_ps(fix2,tx);
1024 fiy2 = _mm256_add_ps(fiy2,ty);
1025 fiz2 = _mm256_add_ps(fiz2,tz);
1027 fjx3 = _mm256_add_ps(fjx3,tx);
1028 fjy3 = _mm256_add_ps(fjy3,ty);
1029 fjz3 = _mm256_add_ps(fjz3,tz);
1031 /**************************
1032 * CALCULATE INTERACTIONS *
1033 **************************/
1035 /* COULOMB ELECTROSTATICS */
1036 velec = _mm256_mul_ps(qq31,rinv31);
1037 felec = _mm256_mul_ps(velec,rinvsq31);
1039 /* Update potential sum for this i atom from the interaction with this j atom. */
1040 velec = _mm256_andnot_ps(dummy_mask,velec);
1041 velecsum = _mm256_add_ps(velecsum,velec);
1045 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1047 /* Calculate temporary vectorial force */
1048 tx = _mm256_mul_ps(fscal,dx31);
1049 ty = _mm256_mul_ps(fscal,dy31);
1050 tz = _mm256_mul_ps(fscal,dz31);
1052 /* Update vectorial force */
1053 fix3 = _mm256_add_ps(fix3,tx);
1054 fiy3 = _mm256_add_ps(fiy3,ty);
1055 fiz3 = _mm256_add_ps(fiz3,tz);
1057 fjx1 = _mm256_add_ps(fjx1,tx);
1058 fjy1 = _mm256_add_ps(fjy1,ty);
1059 fjz1 = _mm256_add_ps(fjz1,tz);
1061 /**************************
1062 * CALCULATE INTERACTIONS *
1063 **************************/
1065 /* COULOMB ELECTROSTATICS */
1066 velec = _mm256_mul_ps(qq32,rinv32);
1067 felec = _mm256_mul_ps(velec,rinvsq32);
1069 /* Update potential sum for this i atom from the interaction with this j atom. */
1070 velec = _mm256_andnot_ps(dummy_mask,velec);
1071 velecsum = _mm256_add_ps(velecsum,velec);
1075 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1077 /* Calculate temporary vectorial force */
1078 tx = _mm256_mul_ps(fscal,dx32);
1079 ty = _mm256_mul_ps(fscal,dy32);
1080 tz = _mm256_mul_ps(fscal,dz32);
1082 /* Update vectorial force */
1083 fix3 = _mm256_add_ps(fix3,tx);
1084 fiy3 = _mm256_add_ps(fiy3,ty);
1085 fiz3 = _mm256_add_ps(fiz3,tz);
1087 fjx2 = _mm256_add_ps(fjx2,tx);
1088 fjy2 = _mm256_add_ps(fjy2,ty);
1089 fjz2 = _mm256_add_ps(fjz2,tz);
1091 /**************************
1092 * CALCULATE INTERACTIONS *
1093 **************************/
1095 /* COULOMB ELECTROSTATICS */
1096 velec = _mm256_mul_ps(qq33,rinv33);
1097 felec = _mm256_mul_ps(velec,rinvsq33);
1099 /* Update potential sum for this i atom from the interaction with this j atom. */
1100 velec = _mm256_andnot_ps(dummy_mask,velec);
1101 velecsum = _mm256_add_ps(velecsum,velec);
1105 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1107 /* Calculate temporary vectorial force */
1108 tx = _mm256_mul_ps(fscal,dx33);
1109 ty = _mm256_mul_ps(fscal,dy33);
1110 tz = _mm256_mul_ps(fscal,dz33);
1112 /* Update vectorial force */
1113 fix3 = _mm256_add_ps(fix3,tx);
1114 fiy3 = _mm256_add_ps(fiy3,ty);
1115 fiz3 = _mm256_add_ps(fiz3,tz);
1117 fjx3 = _mm256_add_ps(fjx3,tx);
1118 fjy3 = _mm256_add_ps(fjy3,ty);
1119 fjz3 = _mm256_add_ps(fjz3,tz);
1121 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1122 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1123 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1124 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1125 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1126 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1127 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1128 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1130 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1131 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1132 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1134 /* Inner loop uses 303 flops */
1137 /* End of innermost loop */
1139 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1140 f+i_coord_offset,fshift+i_shift_offset);
1143 /* Update potential energies */
1144 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1145 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1147 /* Increment number of inner iterations */
1148 inneriter += j_index_end - j_index_start;
1150 /* Outer loop uses 26 flops */
1153 /* Increment number of outer iterations */
1156 /* Update outer/inner flops */
1158 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*303);
1161 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_256_single
1162 * Electrostatics interaction: Coulomb
1163 * VdW interaction: CubicSplineTable
1164 * Geometry: Water4-Water4
1165 * Calculate force/pot: Force
1168 nb_kernel_ElecCoul_VdwCSTab_GeomW4W4_F_avx_256_single
1169 (t_nblist * gmx_restrict nlist,
1170 rvec * gmx_restrict xx,
1171 rvec * gmx_restrict ff,
1172 t_forcerec * gmx_restrict fr,
1173 t_mdatoms * gmx_restrict mdatoms,
1174 nb_kernel_data_t * gmx_restrict kernel_data,
1175 t_nrnb * gmx_restrict nrnb)
1177 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1178 * just 0 for non-waters.
1179 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1180 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1182 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1183 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1184 int jnrA,jnrB,jnrC,jnrD;
1185 int jnrE,jnrF,jnrG,jnrH;
1186 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1187 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1188 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1189 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1190 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1191 real rcutoff_scalar;
1192 real *shiftvec,*fshift,*x,*f;
1193 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1194 real scratch[4*DIM];
1195 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1196 real * vdwioffsetptr0;
1197 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1198 real * vdwioffsetptr1;
1199 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1200 real * vdwioffsetptr2;
1201 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1202 real * vdwioffsetptr3;
1203 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1204 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1205 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1206 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1207 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1208 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1209 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1210 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1211 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1212 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1213 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1214 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1215 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1216 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1217 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1218 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1219 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1220 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1221 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1222 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1225 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1228 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1229 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1231 __m128i vfitab_lo,vfitab_hi;
1232 __m128i ifour = _mm_set1_epi32(4);
1233 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1235 __m256 dummy_mask,cutoff_mask;
1236 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1237 __m256 one = _mm256_set1_ps(1.0);
1238 __m256 two = _mm256_set1_ps(2.0);
1244 jindex = nlist->jindex;
1246 shiftidx = nlist->shift;
1248 shiftvec = fr->shift_vec[0];
1249 fshift = fr->fshift[0];
1250 facel = _mm256_set1_ps(fr->epsfac);
1251 charge = mdatoms->chargeA;
1252 nvdwtype = fr->ntype;
1253 vdwparam = fr->nbfp;
1254 vdwtype = mdatoms->typeA;
1256 vftab = kernel_data->table_vdw->data;
1257 vftabscale = _mm256_set1_ps(kernel_data->table_vdw->scale);
1259 /* Setup water-specific parameters */
1260 inr = nlist->iinr[0];
1261 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1262 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1263 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1264 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1266 jq1 = _mm256_set1_ps(charge[inr+1]);
1267 jq2 = _mm256_set1_ps(charge[inr+2]);
1268 jq3 = _mm256_set1_ps(charge[inr+3]);
1269 vdwjidx0A = 2*vdwtype[inr+0];
1270 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1271 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1272 qq11 = _mm256_mul_ps(iq1,jq1);
1273 qq12 = _mm256_mul_ps(iq1,jq2);
1274 qq13 = _mm256_mul_ps(iq1,jq3);
1275 qq21 = _mm256_mul_ps(iq2,jq1);
1276 qq22 = _mm256_mul_ps(iq2,jq2);
1277 qq23 = _mm256_mul_ps(iq2,jq3);
1278 qq31 = _mm256_mul_ps(iq3,jq1);
1279 qq32 = _mm256_mul_ps(iq3,jq2);
1280 qq33 = _mm256_mul_ps(iq3,jq3);
1282 /* Avoid stupid compiler warnings */
1283 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1284 j_coord_offsetA = 0;
1285 j_coord_offsetB = 0;
1286 j_coord_offsetC = 0;
1287 j_coord_offsetD = 0;
1288 j_coord_offsetE = 0;
1289 j_coord_offsetF = 0;
1290 j_coord_offsetG = 0;
1291 j_coord_offsetH = 0;
1296 for(iidx=0;iidx<4*DIM;iidx++)
1298 scratch[iidx] = 0.0;
1301 /* Start outer loop over neighborlists */
1302 for(iidx=0; iidx<nri; iidx++)
1304 /* Load shift vector for this list */
1305 i_shift_offset = DIM*shiftidx[iidx];
1307 /* Load limits for loop over neighbors */
1308 j_index_start = jindex[iidx];
1309 j_index_end = jindex[iidx+1];
1311 /* Get outer coordinate index */
1313 i_coord_offset = DIM*inr;
1315 /* Load i particle coords and add shift vector */
1316 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1317 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1319 fix0 = _mm256_setzero_ps();
1320 fiy0 = _mm256_setzero_ps();
1321 fiz0 = _mm256_setzero_ps();
1322 fix1 = _mm256_setzero_ps();
1323 fiy1 = _mm256_setzero_ps();
1324 fiz1 = _mm256_setzero_ps();
1325 fix2 = _mm256_setzero_ps();
1326 fiy2 = _mm256_setzero_ps();
1327 fiz2 = _mm256_setzero_ps();
1328 fix3 = _mm256_setzero_ps();
1329 fiy3 = _mm256_setzero_ps();
1330 fiz3 = _mm256_setzero_ps();
1332 /* Start inner kernel loop */
1333 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1336 /* Get j neighbor index, and coordinate index */
1338 jnrB = jjnr[jidx+1];
1339 jnrC = jjnr[jidx+2];
1340 jnrD = jjnr[jidx+3];
1341 jnrE = jjnr[jidx+4];
1342 jnrF = jjnr[jidx+5];
1343 jnrG = jjnr[jidx+6];
1344 jnrH = jjnr[jidx+7];
1345 j_coord_offsetA = DIM*jnrA;
1346 j_coord_offsetB = DIM*jnrB;
1347 j_coord_offsetC = DIM*jnrC;
1348 j_coord_offsetD = DIM*jnrD;
1349 j_coord_offsetE = DIM*jnrE;
1350 j_coord_offsetF = DIM*jnrF;
1351 j_coord_offsetG = DIM*jnrG;
1352 j_coord_offsetH = DIM*jnrH;
1354 /* load j atom coordinates */
1355 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1356 x+j_coord_offsetC,x+j_coord_offsetD,
1357 x+j_coord_offsetE,x+j_coord_offsetF,
1358 x+j_coord_offsetG,x+j_coord_offsetH,
1359 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1360 &jy2,&jz2,&jx3,&jy3,&jz3);
1362 /* Calculate displacement vector */
1363 dx00 = _mm256_sub_ps(ix0,jx0);
1364 dy00 = _mm256_sub_ps(iy0,jy0);
1365 dz00 = _mm256_sub_ps(iz0,jz0);
1366 dx11 = _mm256_sub_ps(ix1,jx1);
1367 dy11 = _mm256_sub_ps(iy1,jy1);
1368 dz11 = _mm256_sub_ps(iz1,jz1);
1369 dx12 = _mm256_sub_ps(ix1,jx2);
1370 dy12 = _mm256_sub_ps(iy1,jy2);
1371 dz12 = _mm256_sub_ps(iz1,jz2);
1372 dx13 = _mm256_sub_ps(ix1,jx3);
1373 dy13 = _mm256_sub_ps(iy1,jy3);
1374 dz13 = _mm256_sub_ps(iz1,jz3);
1375 dx21 = _mm256_sub_ps(ix2,jx1);
1376 dy21 = _mm256_sub_ps(iy2,jy1);
1377 dz21 = _mm256_sub_ps(iz2,jz1);
1378 dx22 = _mm256_sub_ps(ix2,jx2);
1379 dy22 = _mm256_sub_ps(iy2,jy2);
1380 dz22 = _mm256_sub_ps(iz2,jz2);
1381 dx23 = _mm256_sub_ps(ix2,jx3);
1382 dy23 = _mm256_sub_ps(iy2,jy3);
1383 dz23 = _mm256_sub_ps(iz2,jz3);
1384 dx31 = _mm256_sub_ps(ix3,jx1);
1385 dy31 = _mm256_sub_ps(iy3,jy1);
1386 dz31 = _mm256_sub_ps(iz3,jz1);
1387 dx32 = _mm256_sub_ps(ix3,jx2);
1388 dy32 = _mm256_sub_ps(iy3,jy2);
1389 dz32 = _mm256_sub_ps(iz3,jz2);
1390 dx33 = _mm256_sub_ps(ix3,jx3);
1391 dy33 = _mm256_sub_ps(iy3,jy3);
1392 dz33 = _mm256_sub_ps(iz3,jz3);
1394 /* Calculate squared distance and things based on it */
1395 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1396 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1397 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1398 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1399 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1400 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1401 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1402 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1403 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1404 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1406 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1407 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1408 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1409 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1410 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1411 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1412 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1413 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1414 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1415 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1417 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1418 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1419 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1420 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1421 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1422 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1423 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1424 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1425 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1427 fjx0 = _mm256_setzero_ps();
1428 fjy0 = _mm256_setzero_ps();
1429 fjz0 = _mm256_setzero_ps();
1430 fjx1 = _mm256_setzero_ps();
1431 fjy1 = _mm256_setzero_ps();
1432 fjz1 = _mm256_setzero_ps();
1433 fjx2 = _mm256_setzero_ps();
1434 fjy2 = _mm256_setzero_ps();
1435 fjz2 = _mm256_setzero_ps();
1436 fjx3 = _mm256_setzero_ps();
1437 fjy3 = _mm256_setzero_ps();
1438 fjz3 = _mm256_setzero_ps();
1440 /**************************
1441 * CALCULATE INTERACTIONS *
1442 **************************/
1444 r00 = _mm256_mul_ps(rsq00,rinv00);
1446 /* Calculate table index by multiplying r with table scale and truncate to integer */
1447 rt = _mm256_mul_ps(r00,vftabscale);
1448 vfitab = _mm256_cvttps_epi32(rt);
1449 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1450 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1451 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1452 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1453 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
1454 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
1456 /* CUBIC SPLINE TABLE DISPERSION */
1457 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1458 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1459 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1460 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1461 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1462 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1463 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1464 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1465 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1466 Heps = _mm256_mul_ps(vfeps,H);
1467 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1468 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1469 fvdw6 = _mm256_mul_ps(c6_00,FF);
1471 /* CUBIC SPLINE TABLE REPULSION */
1472 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1473 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1474 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1475 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1476 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1477 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1478 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1479 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1480 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1481 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1482 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1483 Heps = _mm256_mul_ps(vfeps,H);
1484 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1485 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1486 fvdw12 = _mm256_mul_ps(c12_00,FF);
1487 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1491 /* Calculate temporary vectorial force */
1492 tx = _mm256_mul_ps(fscal,dx00);
1493 ty = _mm256_mul_ps(fscal,dy00);
1494 tz = _mm256_mul_ps(fscal,dz00);
1496 /* Update vectorial force */
1497 fix0 = _mm256_add_ps(fix0,tx);
1498 fiy0 = _mm256_add_ps(fiy0,ty);
1499 fiz0 = _mm256_add_ps(fiz0,tz);
1501 fjx0 = _mm256_add_ps(fjx0,tx);
1502 fjy0 = _mm256_add_ps(fjy0,ty);
1503 fjz0 = _mm256_add_ps(fjz0,tz);
1505 /**************************
1506 * CALCULATE INTERACTIONS *
1507 **************************/
1509 /* COULOMB ELECTROSTATICS */
1510 velec = _mm256_mul_ps(qq11,rinv11);
1511 felec = _mm256_mul_ps(velec,rinvsq11);
1515 /* Calculate temporary vectorial force */
1516 tx = _mm256_mul_ps(fscal,dx11);
1517 ty = _mm256_mul_ps(fscal,dy11);
1518 tz = _mm256_mul_ps(fscal,dz11);
1520 /* Update vectorial force */
1521 fix1 = _mm256_add_ps(fix1,tx);
1522 fiy1 = _mm256_add_ps(fiy1,ty);
1523 fiz1 = _mm256_add_ps(fiz1,tz);
1525 fjx1 = _mm256_add_ps(fjx1,tx);
1526 fjy1 = _mm256_add_ps(fjy1,ty);
1527 fjz1 = _mm256_add_ps(fjz1,tz);
1529 /**************************
1530 * CALCULATE INTERACTIONS *
1531 **************************/
1533 /* COULOMB ELECTROSTATICS */
1534 velec = _mm256_mul_ps(qq12,rinv12);
1535 felec = _mm256_mul_ps(velec,rinvsq12);
1539 /* Calculate temporary vectorial force */
1540 tx = _mm256_mul_ps(fscal,dx12);
1541 ty = _mm256_mul_ps(fscal,dy12);
1542 tz = _mm256_mul_ps(fscal,dz12);
1544 /* Update vectorial force */
1545 fix1 = _mm256_add_ps(fix1,tx);
1546 fiy1 = _mm256_add_ps(fiy1,ty);
1547 fiz1 = _mm256_add_ps(fiz1,tz);
1549 fjx2 = _mm256_add_ps(fjx2,tx);
1550 fjy2 = _mm256_add_ps(fjy2,ty);
1551 fjz2 = _mm256_add_ps(fjz2,tz);
1553 /**************************
1554 * CALCULATE INTERACTIONS *
1555 **************************/
1557 /* COULOMB ELECTROSTATICS */
1558 velec = _mm256_mul_ps(qq13,rinv13);
1559 felec = _mm256_mul_ps(velec,rinvsq13);
1563 /* Calculate temporary vectorial force */
1564 tx = _mm256_mul_ps(fscal,dx13);
1565 ty = _mm256_mul_ps(fscal,dy13);
1566 tz = _mm256_mul_ps(fscal,dz13);
1568 /* Update vectorial force */
1569 fix1 = _mm256_add_ps(fix1,tx);
1570 fiy1 = _mm256_add_ps(fiy1,ty);
1571 fiz1 = _mm256_add_ps(fiz1,tz);
1573 fjx3 = _mm256_add_ps(fjx3,tx);
1574 fjy3 = _mm256_add_ps(fjy3,ty);
1575 fjz3 = _mm256_add_ps(fjz3,tz);
1577 /**************************
1578 * CALCULATE INTERACTIONS *
1579 **************************/
1581 /* COULOMB ELECTROSTATICS */
1582 velec = _mm256_mul_ps(qq21,rinv21);
1583 felec = _mm256_mul_ps(velec,rinvsq21);
1587 /* Calculate temporary vectorial force */
1588 tx = _mm256_mul_ps(fscal,dx21);
1589 ty = _mm256_mul_ps(fscal,dy21);
1590 tz = _mm256_mul_ps(fscal,dz21);
1592 /* Update vectorial force */
1593 fix2 = _mm256_add_ps(fix2,tx);
1594 fiy2 = _mm256_add_ps(fiy2,ty);
1595 fiz2 = _mm256_add_ps(fiz2,tz);
1597 fjx1 = _mm256_add_ps(fjx1,tx);
1598 fjy1 = _mm256_add_ps(fjy1,ty);
1599 fjz1 = _mm256_add_ps(fjz1,tz);
1601 /**************************
1602 * CALCULATE INTERACTIONS *
1603 **************************/
1605 /* COULOMB ELECTROSTATICS */
1606 velec = _mm256_mul_ps(qq22,rinv22);
1607 felec = _mm256_mul_ps(velec,rinvsq22);
1611 /* Calculate temporary vectorial force */
1612 tx = _mm256_mul_ps(fscal,dx22);
1613 ty = _mm256_mul_ps(fscal,dy22);
1614 tz = _mm256_mul_ps(fscal,dz22);
1616 /* Update vectorial force */
1617 fix2 = _mm256_add_ps(fix2,tx);
1618 fiy2 = _mm256_add_ps(fiy2,ty);
1619 fiz2 = _mm256_add_ps(fiz2,tz);
1621 fjx2 = _mm256_add_ps(fjx2,tx);
1622 fjy2 = _mm256_add_ps(fjy2,ty);
1623 fjz2 = _mm256_add_ps(fjz2,tz);
1625 /**************************
1626 * CALCULATE INTERACTIONS *
1627 **************************/
1629 /* COULOMB ELECTROSTATICS */
1630 velec = _mm256_mul_ps(qq23,rinv23);
1631 felec = _mm256_mul_ps(velec,rinvsq23);
1635 /* Calculate temporary vectorial force */
1636 tx = _mm256_mul_ps(fscal,dx23);
1637 ty = _mm256_mul_ps(fscal,dy23);
1638 tz = _mm256_mul_ps(fscal,dz23);
1640 /* Update vectorial force */
1641 fix2 = _mm256_add_ps(fix2,tx);
1642 fiy2 = _mm256_add_ps(fiy2,ty);
1643 fiz2 = _mm256_add_ps(fiz2,tz);
1645 fjx3 = _mm256_add_ps(fjx3,tx);
1646 fjy3 = _mm256_add_ps(fjy3,ty);
1647 fjz3 = _mm256_add_ps(fjz3,tz);
1649 /**************************
1650 * CALCULATE INTERACTIONS *
1651 **************************/
1653 /* COULOMB ELECTROSTATICS */
1654 velec = _mm256_mul_ps(qq31,rinv31);
1655 felec = _mm256_mul_ps(velec,rinvsq31);
1659 /* Calculate temporary vectorial force */
1660 tx = _mm256_mul_ps(fscal,dx31);
1661 ty = _mm256_mul_ps(fscal,dy31);
1662 tz = _mm256_mul_ps(fscal,dz31);
1664 /* Update vectorial force */
1665 fix3 = _mm256_add_ps(fix3,tx);
1666 fiy3 = _mm256_add_ps(fiy3,ty);
1667 fiz3 = _mm256_add_ps(fiz3,tz);
1669 fjx1 = _mm256_add_ps(fjx1,tx);
1670 fjy1 = _mm256_add_ps(fjy1,ty);
1671 fjz1 = _mm256_add_ps(fjz1,tz);
1673 /**************************
1674 * CALCULATE INTERACTIONS *
1675 **************************/
1677 /* COULOMB ELECTROSTATICS */
1678 velec = _mm256_mul_ps(qq32,rinv32);
1679 felec = _mm256_mul_ps(velec,rinvsq32);
1683 /* Calculate temporary vectorial force */
1684 tx = _mm256_mul_ps(fscal,dx32);
1685 ty = _mm256_mul_ps(fscal,dy32);
1686 tz = _mm256_mul_ps(fscal,dz32);
1688 /* Update vectorial force */
1689 fix3 = _mm256_add_ps(fix3,tx);
1690 fiy3 = _mm256_add_ps(fiy3,ty);
1691 fiz3 = _mm256_add_ps(fiz3,tz);
1693 fjx2 = _mm256_add_ps(fjx2,tx);
1694 fjy2 = _mm256_add_ps(fjy2,ty);
1695 fjz2 = _mm256_add_ps(fjz2,tz);
1697 /**************************
1698 * CALCULATE INTERACTIONS *
1699 **************************/
1701 /* COULOMB ELECTROSTATICS */
1702 velec = _mm256_mul_ps(qq33,rinv33);
1703 felec = _mm256_mul_ps(velec,rinvsq33);
1707 /* Calculate temporary vectorial force */
1708 tx = _mm256_mul_ps(fscal,dx33);
1709 ty = _mm256_mul_ps(fscal,dy33);
1710 tz = _mm256_mul_ps(fscal,dz33);
1712 /* Update vectorial force */
1713 fix3 = _mm256_add_ps(fix3,tx);
1714 fiy3 = _mm256_add_ps(fiy3,ty);
1715 fiz3 = _mm256_add_ps(fiz3,tz);
1717 fjx3 = _mm256_add_ps(fjx3,tx);
1718 fjy3 = _mm256_add_ps(fjy3,ty);
1719 fjz3 = _mm256_add_ps(fjz3,tz);
1721 fjptrA = f+j_coord_offsetA;
1722 fjptrB = f+j_coord_offsetB;
1723 fjptrC = f+j_coord_offsetC;
1724 fjptrD = f+j_coord_offsetD;
1725 fjptrE = f+j_coord_offsetE;
1726 fjptrF = f+j_coord_offsetF;
1727 fjptrG = f+j_coord_offsetG;
1728 fjptrH = f+j_coord_offsetH;
1730 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1731 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1732 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1734 /* Inner loop uses 285 flops */
1737 if(jidx<j_index_end)
1740 /* Get j neighbor index, and coordinate index */
1741 jnrlistA = jjnr[jidx];
1742 jnrlistB = jjnr[jidx+1];
1743 jnrlistC = jjnr[jidx+2];
1744 jnrlistD = jjnr[jidx+3];
1745 jnrlistE = jjnr[jidx+4];
1746 jnrlistF = jjnr[jidx+5];
1747 jnrlistG = jjnr[jidx+6];
1748 jnrlistH = jjnr[jidx+7];
1749 /* Sign of each element will be negative for non-real atoms.
1750 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1751 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1753 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1754 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1756 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1757 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1758 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1759 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1760 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1761 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1762 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1763 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1764 j_coord_offsetA = DIM*jnrA;
1765 j_coord_offsetB = DIM*jnrB;
1766 j_coord_offsetC = DIM*jnrC;
1767 j_coord_offsetD = DIM*jnrD;
1768 j_coord_offsetE = DIM*jnrE;
1769 j_coord_offsetF = DIM*jnrF;
1770 j_coord_offsetG = DIM*jnrG;
1771 j_coord_offsetH = DIM*jnrH;
1773 /* load j atom coordinates */
1774 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1775 x+j_coord_offsetC,x+j_coord_offsetD,
1776 x+j_coord_offsetE,x+j_coord_offsetF,
1777 x+j_coord_offsetG,x+j_coord_offsetH,
1778 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1779 &jy2,&jz2,&jx3,&jy3,&jz3);
1781 /* Calculate displacement vector */
1782 dx00 = _mm256_sub_ps(ix0,jx0);
1783 dy00 = _mm256_sub_ps(iy0,jy0);
1784 dz00 = _mm256_sub_ps(iz0,jz0);
1785 dx11 = _mm256_sub_ps(ix1,jx1);
1786 dy11 = _mm256_sub_ps(iy1,jy1);
1787 dz11 = _mm256_sub_ps(iz1,jz1);
1788 dx12 = _mm256_sub_ps(ix1,jx2);
1789 dy12 = _mm256_sub_ps(iy1,jy2);
1790 dz12 = _mm256_sub_ps(iz1,jz2);
1791 dx13 = _mm256_sub_ps(ix1,jx3);
1792 dy13 = _mm256_sub_ps(iy1,jy3);
1793 dz13 = _mm256_sub_ps(iz1,jz3);
1794 dx21 = _mm256_sub_ps(ix2,jx1);
1795 dy21 = _mm256_sub_ps(iy2,jy1);
1796 dz21 = _mm256_sub_ps(iz2,jz1);
1797 dx22 = _mm256_sub_ps(ix2,jx2);
1798 dy22 = _mm256_sub_ps(iy2,jy2);
1799 dz22 = _mm256_sub_ps(iz2,jz2);
1800 dx23 = _mm256_sub_ps(ix2,jx3);
1801 dy23 = _mm256_sub_ps(iy2,jy3);
1802 dz23 = _mm256_sub_ps(iz2,jz3);
1803 dx31 = _mm256_sub_ps(ix3,jx1);
1804 dy31 = _mm256_sub_ps(iy3,jy1);
1805 dz31 = _mm256_sub_ps(iz3,jz1);
1806 dx32 = _mm256_sub_ps(ix3,jx2);
1807 dy32 = _mm256_sub_ps(iy3,jy2);
1808 dz32 = _mm256_sub_ps(iz3,jz2);
1809 dx33 = _mm256_sub_ps(ix3,jx3);
1810 dy33 = _mm256_sub_ps(iy3,jy3);
1811 dz33 = _mm256_sub_ps(iz3,jz3);
1813 /* Calculate squared distance and things based on it */
1814 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1815 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1816 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1817 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1818 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1819 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1820 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1821 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1822 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1823 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1825 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1826 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1827 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1828 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1829 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1830 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1831 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1832 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1833 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1834 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1836 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1837 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1838 rinvsq13 = _mm256_mul_ps(rinv13,rinv13);
1839 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1840 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1841 rinvsq23 = _mm256_mul_ps(rinv23,rinv23);
1842 rinvsq31 = _mm256_mul_ps(rinv31,rinv31);
1843 rinvsq32 = _mm256_mul_ps(rinv32,rinv32);
1844 rinvsq33 = _mm256_mul_ps(rinv33,rinv33);
1846 fjx0 = _mm256_setzero_ps();
1847 fjy0 = _mm256_setzero_ps();
1848 fjz0 = _mm256_setzero_ps();
1849 fjx1 = _mm256_setzero_ps();
1850 fjy1 = _mm256_setzero_ps();
1851 fjz1 = _mm256_setzero_ps();
1852 fjx2 = _mm256_setzero_ps();
1853 fjy2 = _mm256_setzero_ps();
1854 fjz2 = _mm256_setzero_ps();
1855 fjx3 = _mm256_setzero_ps();
1856 fjy3 = _mm256_setzero_ps();
1857 fjz3 = _mm256_setzero_ps();
1859 /**************************
1860 * CALCULATE INTERACTIONS *
1861 **************************/
1863 r00 = _mm256_mul_ps(rsq00,rinv00);
1864 r00 = _mm256_andnot_ps(dummy_mask,r00);
1866 /* Calculate table index by multiplying r with table scale and truncate to integer */
1867 rt = _mm256_mul_ps(r00,vftabscale);
1868 vfitab = _mm256_cvttps_epi32(rt);
1869 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1870 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1871 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1872 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1873 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
1874 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
1876 /* CUBIC SPLINE TABLE DISPERSION */
1877 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1878 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1879 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1880 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1881 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1882 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1883 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1884 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1885 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1886 Heps = _mm256_mul_ps(vfeps,H);
1887 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1888 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1889 fvdw6 = _mm256_mul_ps(c6_00,FF);
1891 /* CUBIC SPLINE TABLE REPULSION */
1892 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1893 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1894 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1895 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1896 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1897 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1898 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1899 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1900 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1901 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1902 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1903 Heps = _mm256_mul_ps(vfeps,H);
1904 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1905 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1906 fvdw12 = _mm256_mul_ps(c12_00,FF);
1907 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1911 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1913 /* Calculate temporary vectorial force */
1914 tx = _mm256_mul_ps(fscal,dx00);
1915 ty = _mm256_mul_ps(fscal,dy00);
1916 tz = _mm256_mul_ps(fscal,dz00);
1918 /* Update vectorial force */
1919 fix0 = _mm256_add_ps(fix0,tx);
1920 fiy0 = _mm256_add_ps(fiy0,ty);
1921 fiz0 = _mm256_add_ps(fiz0,tz);
1923 fjx0 = _mm256_add_ps(fjx0,tx);
1924 fjy0 = _mm256_add_ps(fjy0,ty);
1925 fjz0 = _mm256_add_ps(fjz0,tz);
1927 /**************************
1928 * CALCULATE INTERACTIONS *
1929 **************************/
1931 /* COULOMB ELECTROSTATICS */
1932 velec = _mm256_mul_ps(qq11,rinv11);
1933 felec = _mm256_mul_ps(velec,rinvsq11);
1937 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1939 /* Calculate temporary vectorial force */
1940 tx = _mm256_mul_ps(fscal,dx11);
1941 ty = _mm256_mul_ps(fscal,dy11);
1942 tz = _mm256_mul_ps(fscal,dz11);
1944 /* Update vectorial force */
1945 fix1 = _mm256_add_ps(fix1,tx);
1946 fiy1 = _mm256_add_ps(fiy1,ty);
1947 fiz1 = _mm256_add_ps(fiz1,tz);
1949 fjx1 = _mm256_add_ps(fjx1,tx);
1950 fjy1 = _mm256_add_ps(fjy1,ty);
1951 fjz1 = _mm256_add_ps(fjz1,tz);
1953 /**************************
1954 * CALCULATE INTERACTIONS *
1955 **************************/
1957 /* COULOMB ELECTROSTATICS */
1958 velec = _mm256_mul_ps(qq12,rinv12);
1959 felec = _mm256_mul_ps(velec,rinvsq12);
1963 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1965 /* Calculate temporary vectorial force */
1966 tx = _mm256_mul_ps(fscal,dx12);
1967 ty = _mm256_mul_ps(fscal,dy12);
1968 tz = _mm256_mul_ps(fscal,dz12);
1970 /* Update vectorial force */
1971 fix1 = _mm256_add_ps(fix1,tx);
1972 fiy1 = _mm256_add_ps(fiy1,ty);
1973 fiz1 = _mm256_add_ps(fiz1,tz);
1975 fjx2 = _mm256_add_ps(fjx2,tx);
1976 fjy2 = _mm256_add_ps(fjy2,ty);
1977 fjz2 = _mm256_add_ps(fjz2,tz);
1979 /**************************
1980 * CALCULATE INTERACTIONS *
1981 **************************/
1983 /* COULOMB ELECTROSTATICS */
1984 velec = _mm256_mul_ps(qq13,rinv13);
1985 felec = _mm256_mul_ps(velec,rinvsq13);
1989 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1991 /* Calculate temporary vectorial force */
1992 tx = _mm256_mul_ps(fscal,dx13);
1993 ty = _mm256_mul_ps(fscal,dy13);
1994 tz = _mm256_mul_ps(fscal,dz13);
1996 /* Update vectorial force */
1997 fix1 = _mm256_add_ps(fix1,tx);
1998 fiy1 = _mm256_add_ps(fiy1,ty);
1999 fiz1 = _mm256_add_ps(fiz1,tz);
2001 fjx3 = _mm256_add_ps(fjx3,tx);
2002 fjy3 = _mm256_add_ps(fjy3,ty);
2003 fjz3 = _mm256_add_ps(fjz3,tz);
2005 /**************************
2006 * CALCULATE INTERACTIONS *
2007 **************************/
2009 /* COULOMB ELECTROSTATICS */
2010 velec = _mm256_mul_ps(qq21,rinv21);
2011 felec = _mm256_mul_ps(velec,rinvsq21);
2015 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2017 /* Calculate temporary vectorial force */
2018 tx = _mm256_mul_ps(fscal,dx21);
2019 ty = _mm256_mul_ps(fscal,dy21);
2020 tz = _mm256_mul_ps(fscal,dz21);
2022 /* Update vectorial force */
2023 fix2 = _mm256_add_ps(fix2,tx);
2024 fiy2 = _mm256_add_ps(fiy2,ty);
2025 fiz2 = _mm256_add_ps(fiz2,tz);
2027 fjx1 = _mm256_add_ps(fjx1,tx);
2028 fjy1 = _mm256_add_ps(fjy1,ty);
2029 fjz1 = _mm256_add_ps(fjz1,tz);
2031 /**************************
2032 * CALCULATE INTERACTIONS *
2033 **************************/
2035 /* COULOMB ELECTROSTATICS */
2036 velec = _mm256_mul_ps(qq22,rinv22);
2037 felec = _mm256_mul_ps(velec,rinvsq22);
2041 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2043 /* Calculate temporary vectorial force */
2044 tx = _mm256_mul_ps(fscal,dx22);
2045 ty = _mm256_mul_ps(fscal,dy22);
2046 tz = _mm256_mul_ps(fscal,dz22);
2048 /* Update vectorial force */
2049 fix2 = _mm256_add_ps(fix2,tx);
2050 fiy2 = _mm256_add_ps(fiy2,ty);
2051 fiz2 = _mm256_add_ps(fiz2,tz);
2053 fjx2 = _mm256_add_ps(fjx2,tx);
2054 fjy2 = _mm256_add_ps(fjy2,ty);
2055 fjz2 = _mm256_add_ps(fjz2,tz);
2057 /**************************
2058 * CALCULATE INTERACTIONS *
2059 **************************/
2061 /* COULOMB ELECTROSTATICS */
2062 velec = _mm256_mul_ps(qq23,rinv23);
2063 felec = _mm256_mul_ps(velec,rinvsq23);
2067 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2069 /* Calculate temporary vectorial force */
2070 tx = _mm256_mul_ps(fscal,dx23);
2071 ty = _mm256_mul_ps(fscal,dy23);
2072 tz = _mm256_mul_ps(fscal,dz23);
2074 /* Update vectorial force */
2075 fix2 = _mm256_add_ps(fix2,tx);
2076 fiy2 = _mm256_add_ps(fiy2,ty);
2077 fiz2 = _mm256_add_ps(fiz2,tz);
2079 fjx3 = _mm256_add_ps(fjx3,tx);
2080 fjy3 = _mm256_add_ps(fjy3,ty);
2081 fjz3 = _mm256_add_ps(fjz3,tz);
2083 /**************************
2084 * CALCULATE INTERACTIONS *
2085 **************************/
2087 /* COULOMB ELECTROSTATICS */
2088 velec = _mm256_mul_ps(qq31,rinv31);
2089 felec = _mm256_mul_ps(velec,rinvsq31);
2093 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2095 /* Calculate temporary vectorial force */
2096 tx = _mm256_mul_ps(fscal,dx31);
2097 ty = _mm256_mul_ps(fscal,dy31);
2098 tz = _mm256_mul_ps(fscal,dz31);
2100 /* Update vectorial force */
2101 fix3 = _mm256_add_ps(fix3,tx);
2102 fiy3 = _mm256_add_ps(fiy3,ty);
2103 fiz3 = _mm256_add_ps(fiz3,tz);
2105 fjx1 = _mm256_add_ps(fjx1,tx);
2106 fjy1 = _mm256_add_ps(fjy1,ty);
2107 fjz1 = _mm256_add_ps(fjz1,tz);
2109 /**************************
2110 * CALCULATE INTERACTIONS *
2111 **************************/
2113 /* COULOMB ELECTROSTATICS */
2114 velec = _mm256_mul_ps(qq32,rinv32);
2115 felec = _mm256_mul_ps(velec,rinvsq32);
2119 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2121 /* Calculate temporary vectorial force */
2122 tx = _mm256_mul_ps(fscal,dx32);
2123 ty = _mm256_mul_ps(fscal,dy32);
2124 tz = _mm256_mul_ps(fscal,dz32);
2126 /* Update vectorial force */
2127 fix3 = _mm256_add_ps(fix3,tx);
2128 fiy3 = _mm256_add_ps(fiy3,ty);
2129 fiz3 = _mm256_add_ps(fiz3,tz);
2131 fjx2 = _mm256_add_ps(fjx2,tx);
2132 fjy2 = _mm256_add_ps(fjy2,ty);
2133 fjz2 = _mm256_add_ps(fjz2,tz);
2135 /**************************
2136 * CALCULATE INTERACTIONS *
2137 **************************/
2139 /* COULOMB ELECTROSTATICS */
2140 velec = _mm256_mul_ps(qq33,rinv33);
2141 felec = _mm256_mul_ps(velec,rinvsq33);
2145 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2147 /* Calculate temporary vectorial force */
2148 tx = _mm256_mul_ps(fscal,dx33);
2149 ty = _mm256_mul_ps(fscal,dy33);
2150 tz = _mm256_mul_ps(fscal,dz33);
2152 /* Update vectorial force */
2153 fix3 = _mm256_add_ps(fix3,tx);
2154 fiy3 = _mm256_add_ps(fiy3,ty);
2155 fiz3 = _mm256_add_ps(fiz3,tz);
2157 fjx3 = _mm256_add_ps(fjx3,tx);
2158 fjy3 = _mm256_add_ps(fjy3,ty);
2159 fjz3 = _mm256_add_ps(fjz3,tz);
2161 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2162 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2163 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2164 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2165 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2166 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2167 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2168 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2170 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2171 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2172 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2174 /* Inner loop uses 286 flops */
2177 /* End of innermost loop */
2179 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2180 f+i_coord_offset,fshift+i_shift_offset);
2182 /* Increment number of inner iterations */
2183 inneriter += j_index_end - j_index_start;
2185 /* Outer loop uses 24 flops */
2188 /* Increment number of outer iterations */
2191 /* Update outer/inner flops */
2193 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*286);