2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 real * vdwioffsetptr3;
79 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
80 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
81 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
82 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
83 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
84 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
85 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
86 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
87 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
88 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
89 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
90 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
91 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
92 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
93 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
94 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
95 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
96 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
97 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
98 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
101 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
104 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
105 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
107 __m128i vfitab_lo,vfitab_hi;
108 __m128i ifour = _mm_set1_epi32(4);
109 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
111 __m256 dummy_mask,cutoff_mask;
112 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
113 __m256 one = _mm256_set1_ps(1.0);
114 __m256 two = _mm256_set1_ps(2.0);
120 jindex = nlist->jindex;
122 shiftidx = nlist->shift;
124 shiftvec = fr->shift_vec[0];
125 fshift = fr->fshift[0];
126 facel = _mm256_set1_ps(fr->epsfac);
127 charge = mdatoms->chargeA;
128 nvdwtype = fr->ntype;
130 vdwtype = mdatoms->typeA;
132 vftab = kernel_data->table_elec->data;
133 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
135 /* Setup water-specific parameters */
136 inr = nlist->iinr[0];
137 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
138 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
139 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
140 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
142 jq1 = _mm256_set1_ps(charge[inr+1]);
143 jq2 = _mm256_set1_ps(charge[inr+2]);
144 jq3 = _mm256_set1_ps(charge[inr+3]);
145 vdwjidx0A = 2*vdwtype[inr+0];
146 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
147 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
148 qq11 = _mm256_mul_ps(iq1,jq1);
149 qq12 = _mm256_mul_ps(iq1,jq2);
150 qq13 = _mm256_mul_ps(iq1,jq3);
151 qq21 = _mm256_mul_ps(iq2,jq1);
152 qq22 = _mm256_mul_ps(iq2,jq2);
153 qq23 = _mm256_mul_ps(iq2,jq3);
154 qq31 = _mm256_mul_ps(iq3,jq1);
155 qq32 = _mm256_mul_ps(iq3,jq2);
156 qq33 = _mm256_mul_ps(iq3,jq3);
158 /* Avoid stupid compiler warnings */
159 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
172 for(iidx=0;iidx<4*DIM;iidx++)
177 /* Start outer loop over neighborlists */
178 for(iidx=0; iidx<nri; iidx++)
180 /* Load shift vector for this list */
181 i_shift_offset = DIM*shiftidx[iidx];
183 /* Load limits for loop over neighbors */
184 j_index_start = jindex[iidx];
185 j_index_end = jindex[iidx+1];
187 /* Get outer coordinate index */
189 i_coord_offset = DIM*inr;
191 /* Load i particle coords and add shift vector */
192 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
193 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
195 fix0 = _mm256_setzero_ps();
196 fiy0 = _mm256_setzero_ps();
197 fiz0 = _mm256_setzero_ps();
198 fix1 = _mm256_setzero_ps();
199 fiy1 = _mm256_setzero_ps();
200 fiz1 = _mm256_setzero_ps();
201 fix2 = _mm256_setzero_ps();
202 fiy2 = _mm256_setzero_ps();
203 fiz2 = _mm256_setzero_ps();
204 fix3 = _mm256_setzero_ps();
205 fiy3 = _mm256_setzero_ps();
206 fiz3 = _mm256_setzero_ps();
208 /* Reset potential sums */
209 velecsum = _mm256_setzero_ps();
210 vvdwsum = _mm256_setzero_ps();
212 /* Start inner kernel loop */
213 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
216 /* Get j neighbor index, and coordinate index */
225 j_coord_offsetA = DIM*jnrA;
226 j_coord_offsetB = DIM*jnrB;
227 j_coord_offsetC = DIM*jnrC;
228 j_coord_offsetD = DIM*jnrD;
229 j_coord_offsetE = DIM*jnrE;
230 j_coord_offsetF = DIM*jnrF;
231 j_coord_offsetG = DIM*jnrG;
232 j_coord_offsetH = DIM*jnrH;
234 /* load j atom coordinates */
235 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236 x+j_coord_offsetC,x+j_coord_offsetD,
237 x+j_coord_offsetE,x+j_coord_offsetF,
238 x+j_coord_offsetG,x+j_coord_offsetH,
239 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
240 &jy2,&jz2,&jx3,&jy3,&jz3);
242 /* Calculate displacement vector */
243 dx00 = _mm256_sub_ps(ix0,jx0);
244 dy00 = _mm256_sub_ps(iy0,jy0);
245 dz00 = _mm256_sub_ps(iz0,jz0);
246 dx11 = _mm256_sub_ps(ix1,jx1);
247 dy11 = _mm256_sub_ps(iy1,jy1);
248 dz11 = _mm256_sub_ps(iz1,jz1);
249 dx12 = _mm256_sub_ps(ix1,jx2);
250 dy12 = _mm256_sub_ps(iy1,jy2);
251 dz12 = _mm256_sub_ps(iz1,jz2);
252 dx13 = _mm256_sub_ps(ix1,jx3);
253 dy13 = _mm256_sub_ps(iy1,jy3);
254 dz13 = _mm256_sub_ps(iz1,jz3);
255 dx21 = _mm256_sub_ps(ix2,jx1);
256 dy21 = _mm256_sub_ps(iy2,jy1);
257 dz21 = _mm256_sub_ps(iz2,jz1);
258 dx22 = _mm256_sub_ps(ix2,jx2);
259 dy22 = _mm256_sub_ps(iy2,jy2);
260 dz22 = _mm256_sub_ps(iz2,jz2);
261 dx23 = _mm256_sub_ps(ix2,jx3);
262 dy23 = _mm256_sub_ps(iy2,jy3);
263 dz23 = _mm256_sub_ps(iz2,jz3);
264 dx31 = _mm256_sub_ps(ix3,jx1);
265 dy31 = _mm256_sub_ps(iy3,jy1);
266 dz31 = _mm256_sub_ps(iz3,jz1);
267 dx32 = _mm256_sub_ps(ix3,jx2);
268 dy32 = _mm256_sub_ps(iy3,jy2);
269 dz32 = _mm256_sub_ps(iz3,jz2);
270 dx33 = _mm256_sub_ps(ix3,jx3);
271 dy33 = _mm256_sub_ps(iy3,jy3);
272 dz33 = _mm256_sub_ps(iz3,jz3);
274 /* Calculate squared distance and things based on it */
275 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
276 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
277 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
278 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
279 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
280 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
281 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
282 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
283 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
284 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
286 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
287 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
288 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
289 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
290 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
291 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
292 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
293 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
294 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
296 rinvsq00 = gmx_mm256_inv_ps(rsq00);
298 fjx0 = _mm256_setzero_ps();
299 fjy0 = _mm256_setzero_ps();
300 fjz0 = _mm256_setzero_ps();
301 fjx1 = _mm256_setzero_ps();
302 fjy1 = _mm256_setzero_ps();
303 fjz1 = _mm256_setzero_ps();
304 fjx2 = _mm256_setzero_ps();
305 fjy2 = _mm256_setzero_ps();
306 fjz2 = _mm256_setzero_ps();
307 fjx3 = _mm256_setzero_ps();
308 fjy3 = _mm256_setzero_ps();
309 fjz3 = _mm256_setzero_ps();
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 /* LENNARD-JONES DISPERSION/REPULSION */
317 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
318 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
319 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
320 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
321 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
323 /* Update potential sum for this i atom from the interaction with this j atom. */
324 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
328 /* Calculate temporary vectorial force */
329 tx = _mm256_mul_ps(fscal,dx00);
330 ty = _mm256_mul_ps(fscal,dy00);
331 tz = _mm256_mul_ps(fscal,dz00);
333 /* Update vectorial force */
334 fix0 = _mm256_add_ps(fix0,tx);
335 fiy0 = _mm256_add_ps(fiy0,ty);
336 fiz0 = _mm256_add_ps(fiz0,tz);
338 fjx0 = _mm256_add_ps(fjx0,tx);
339 fjy0 = _mm256_add_ps(fjy0,ty);
340 fjz0 = _mm256_add_ps(fjz0,tz);
342 /**************************
343 * CALCULATE INTERACTIONS *
344 **************************/
346 r11 = _mm256_mul_ps(rsq11,rinv11);
348 /* Calculate table index by multiplying r with table scale and truncate to integer */
349 rt = _mm256_mul_ps(r11,vftabscale);
350 vfitab = _mm256_cvttps_epi32(rt);
351 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
352 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
353 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
354 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
355 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
356 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
358 /* CUBIC SPLINE TABLE ELECTROSTATICS */
359 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
360 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
361 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
362 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
363 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
364 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
365 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
366 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
367 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
368 Heps = _mm256_mul_ps(vfeps,H);
369 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
370 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
371 velec = _mm256_mul_ps(qq11,VV);
372 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
373 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
375 /* Update potential sum for this i atom from the interaction with this j atom. */
376 velecsum = _mm256_add_ps(velecsum,velec);
380 /* Calculate temporary vectorial force */
381 tx = _mm256_mul_ps(fscal,dx11);
382 ty = _mm256_mul_ps(fscal,dy11);
383 tz = _mm256_mul_ps(fscal,dz11);
385 /* Update vectorial force */
386 fix1 = _mm256_add_ps(fix1,tx);
387 fiy1 = _mm256_add_ps(fiy1,ty);
388 fiz1 = _mm256_add_ps(fiz1,tz);
390 fjx1 = _mm256_add_ps(fjx1,tx);
391 fjy1 = _mm256_add_ps(fjy1,ty);
392 fjz1 = _mm256_add_ps(fjz1,tz);
394 /**************************
395 * CALCULATE INTERACTIONS *
396 **************************/
398 r12 = _mm256_mul_ps(rsq12,rinv12);
400 /* Calculate table index by multiplying r with table scale and truncate to integer */
401 rt = _mm256_mul_ps(r12,vftabscale);
402 vfitab = _mm256_cvttps_epi32(rt);
403 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
404 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
405 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
406 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
407 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
408 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
410 /* CUBIC SPLINE TABLE ELECTROSTATICS */
411 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
412 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
413 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
414 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
415 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
416 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
417 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
418 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
419 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
420 Heps = _mm256_mul_ps(vfeps,H);
421 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
422 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
423 velec = _mm256_mul_ps(qq12,VV);
424 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
425 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
427 /* Update potential sum for this i atom from the interaction with this j atom. */
428 velecsum = _mm256_add_ps(velecsum,velec);
432 /* Calculate temporary vectorial force */
433 tx = _mm256_mul_ps(fscal,dx12);
434 ty = _mm256_mul_ps(fscal,dy12);
435 tz = _mm256_mul_ps(fscal,dz12);
437 /* Update vectorial force */
438 fix1 = _mm256_add_ps(fix1,tx);
439 fiy1 = _mm256_add_ps(fiy1,ty);
440 fiz1 = _mm256_add_ps(fiz1,tz);
442 fjx2 = _mm256_add_ps(fjx2,tx);
443 fjy2 = _mm256_add_ps(fjy2,ty);
444 fjz2 = _mm256_add_ps(fjz2,tz);
446 /**************************
447 * CALCULATE INTERACTIONS *
448 **************************/
450 r13 = _mm256_mul_ps(rsq13,rinv13);
452 /* Calculate table index by multiplying r with table scale and truncate to integer */
453 rt = _mm256_mul_ps(r13,vftabscale);
454 vfitab = _mm256_cvttps_epi32(rt);
455 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
456 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
457 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
458 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
459 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
460 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
462 /* CUBIC SPLINE TABLE ELECTROSTATICS */
463 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
464 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
465 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
466 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
467 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
468 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
469 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
470 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
471 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
472 Heps = _mm256_mul_ps(vfeps,H);
473 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
474 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
475 velec = _mm256_mul_ps(qq13,VV);
476 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
477 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
479 /* Update potential sum for this i atom from the interaction with this j atom. */
480 velecsum = _mm256_add_ps(velecsum,velec);
484 /* Calculate temporary vectorial force */
485 tx = _mm256_mul_ps(fscal,dx13);
486 ty = _mm256_mul_ps(fscal,dy13);
487 tz = _mm256_mul_ps(fscal,dz13);
489 /* Update vectorial force */
490 fix1 = _mm256_add_ps(fix1,tx);
491 fiy1 = _mm256_add_ps(fiy1,ty);
492 fiz1 = _mm256_add_ps(fiz1,tz);
494 fjx3 = _mm256_add_ps(fjx3,tx);
495 fjy3 = _mm256_add_ps(fjy3,ty);
496 fjz3 = _mm256_add_ps(fjz3,tz);
498 /**************************
499 * CALCULATE INTERACTIONS *
500 **************************/
502 r21 = _mm256_mul_ps(rsq21,rinv21);
504 /* Calculate table index by multiplying r with table scale and truncate to integer */
505 rt = _mm256_mul_ps(r21,vftabscale);
506 vfitab = _mm256_cvttps_epi32(rt);
507 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
508 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
509 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
510 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
511 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
512 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
514 /* CUBIC SPLINE TABLE ELECTROSTATICS */
515 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
516 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
517 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
518 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
519 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
520 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
521 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
522 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
523 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
524 Heps = _mm256_mul_ps(vfeps,H);
525 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
526 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
527 velec = _mm256_mul_ps(qq21,VV);
528 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
529 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
531 /* Update potential sum for this i atom from the interaction with this j atom. */
532 velecsum = _mm256_add_ps(velecsum,velec);
536 /* Calculate temporary vectorial force */
537 tx = _mm256_mul_ps(fscal,dx21);
538 ty = _mm256_mul_ps(fscal,dy21);
539 tz = _mm256_mul_ps(fscal,dz21);
541 /* Update vectorial force */
542 fix2 = _mm256_add_ps(fix2,tx);
543 fiy2 = _mm256_add_ps(fiy2,ty);
544 fiz2 = _mm256_add_ps(fiz2,tz);
546 fjx1 = _mm256_add_ps(fjx1,tx);
547 fjy1 = _mm256_add_ps(fjy1,ty);
548 fjz1 = _mm256_add_ps(fjz1,tz);
550 /**************************
551 * CALCULATE INTERACTIONS *
552 **************************/
554 r22 = _mm256_mul_ps(rsq22,rinv22);
556 /* Calculate table index by multiplying r with table scale and truncate to integer */
557 rt = _mm256_mul_ps(r22,vftabscale);
558 vfitab = _mm256_cvttps_epi32(rt);
559 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
560 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
561 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
562 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
563 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
564 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
566 /* CUBIC SPLINE TABLE ELECTROSTATICS */
567 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
568 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
569 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
570 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
571 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
572 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
573 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
574 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
575 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
576 Heps = _mm256_mul_ps(vfeps,H);
577 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
578 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
579 velec = _mm256_mul_ps(qq22,VV);
580 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
581 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
583 /* Update potential sum for this i atom from the interaction with this j atom. */
584 velecsum = _mm256_add_ps(velecsum,velec);
588 /* Calculate temporary vectorial force */
589 tx = _mm256_mul_ps(fscal,dx22);
590 ty = _mm256_mul_ps(fscal,dy22);
591 tz = _mm256_mul_ps(fscal,dz22);
593 /* Update vectorial force */
594 fix2 = _mm256_add_ps(fix2,tx);
595 fiy2 = _mm256_add_ps(fiy2,ty);
596 fiz2 = _mm256_add_ps(fiz2,tz);
598 fjx2 = _mm256_add_ps(fjx2,tx);
599 fjy2 = _mm256_add_ps(fjy2,ty);
600 fjz2 = _mm256_add_ps(fjz2,tz);
602 /**************************
603 * CALCULATE INTERACTIONS *
604 **************************/
606 r23 = _mm256_mul_ps(rsq23,rinv23);
608 /* Calculate table index by multiplying r with table scale and truncate to integer */
609 rt = _mm256_mul_ps(r23,vftabscale);
610 vfitab = _mm256_cvttps_epi32(rt);
611 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
612 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
613 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
614 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
615 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
616 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
618 /* CUBIC SPLINE TABLE ELECTROSTATICS */
619 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
620 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
621 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
622 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
623 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
624 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
625 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
626 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
627 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
628 Heps = _mm256_mul_ps(vfeps,H);
629 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
630 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
631 velec = _mm256_mul_ps(qq23,VV);
632 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
633 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
635 /* Update potential sum for this i atom from the interaction with this j atom. */
636 velecsum = _mm256_add_ps(velecsum,velec);
640 /* Calculate temporary vectorial force */
641 tx = _mm256_mul_ps(fscal,dx23);
642 ty = _mm256_mul_ps(fscal,dy23);
643 tz = _mm256_mul_ps(fscal,dz23);
645 /* Update vectorial force */
646 fix2 = _mm256_add_ps(fix2,tx);
647 fiy2 = _mm256_add_ps(fiy2,ty);
648 fiz2 = _mm256_add_ps(fiz2,tz);
650 fjx3 = _mm256_add_ps(fjx3,tx);
651 fjy3 = _mm256_add_ps(fjy3,ty);
652 fjz3 = _mm256_add_ps(fjz3,tz);
654 /**************************
655 * CALCULATE INTERACTIONS *
656 **************************/
658 r31 = _mm256_mul_ps(rsq31,rinv31);
660 /* Calculate table index by multiplying r with table scale and truncate to integer */
661 rt = _mm256_mul_ps(r31,vftabscale);
662 vfitab = _mm256_cvttps_epi32(rt);
663 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
664 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
665 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
666 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
667 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
668 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
670 /* CUBIC SPLINE TABLE ELECTROSTATICS */
671 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
672 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
673 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
674 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
675 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
676 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
677 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
678 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
679 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
680 Heps = _mm256_mul_ps(vfeps,H);
681 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
682 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
683 velec = _mm256_mul_ps(qq31,VV);
684 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
685 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
687 /* Update potential sum for this i atom from the interaction with this j atom. */
688 velecsum = _mm256_add_ps(velecsum,velec);
692 /* Calculate temporary vectorial force */
693 tx = _mm256_mul_ps(fscal,dx31);
694 ty = _mm256_mul_ps(fscal,dy31);
695 tz = _mm256_mul_ps(fscal,dz31);
697 /* Update vectorial force */
698 fix3 = _mm256_add_ps(fix3,tx);
699 fiy3 = _mm256_add_ps(fiy3,ty);
700 fiz3 = _mm256_add_ps(fiz3,tz);
702 fjx1 = _mm256_add_ps(fjx1,tx);
703 fjy1 = _mm256_add_ps(fjy1,ty);
704 fjz1 = _mm256_add_ps(fjz1,tz);
706 /**************************
707 * CALCULATE INTERACTIONS *
708 **************************/
710 r32 = _mm256_mul_ps(rsq32,rinv32);
712 /* Calculate table index by multiplying r with table scale and truncate to integer */
713 rt = _mm256_mul_ps(r32,vftabscale);
714 vfitab = _mm256_cvttps_epi32(rt);
715 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
716 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
717 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
718 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
719 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
720 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
722 /* CUBIC SPLINE TABLE ELECTROSTATICS */
723 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
724 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
725 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
726 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
727 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
728 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
729 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
730 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
731 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
732 Heps = _mm256_mul_ps(vfeps,H);
733 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
734 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
735 velec = _mm256_mul_ps(qq32,VV);
736 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
737 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
739 /* Update potential sum for this i atom from the interaction with this j atom. */
740 velecsum = _mm256_add_ps(velecsum,velec);
744 /* Calculate temporary vectorial force */
745 tx = _mm256_mul_ps(fscal,dx32);
746 ty = _mm256_mul_ps(fscal,dy32);
747 tz = _mm256_mul_ps(fscal,dz32);
749 /* Update vectorial force */
750 fix3 = _mm256_add_ps(fix3,tx);
751 fiy3 = _mm256_add_ps(fiy3,ty);
752 fiz3 = _mm256_add_ps(fiz3,tz);
754 fjx2 = _mm256_add_ps(fjx2,tx);
755 fjy2 = _mm256_add_ps(fjy2,ty);
756 fjz2 = _mm256_add_ps(fjz2,tz);
758 /**************************
759 * CALCULATE INTERACTIONS *
760 **************************/
762 r33 = _mm256_mul_ps(rsq33,rinv33);
764 /* Calculate table index by multiplying r with table scale and truncate to integer */
765 rt = _mm256_mul_ps(r33,vftabscale);
766 vfitab = _mm256_cvttps_epi32(rt);
767 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
768 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
769 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
770 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
771 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
772 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
774 /* CUBIC SPLINE TABLE ELECTROSTATICS */
775 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
776 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
777 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
778 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
779 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
780 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
781 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
782 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
783 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
784 Heps = _mm256_mul_ps(vfeps,H);
785 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
786 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
787 velec = _mm256_mul_ps(qq33,VV);
788 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
789 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
791 /* Update potential sum for this i atom from the interaction with this j atom. */
792 velecsum = _mm256_add_ps(velecsum,velec);
796 /* Calculate temporary vectorial force */
797 tx = _mm256_mul_ps(fscal,dx33);
798 ty = _mm256_mul_ps(fscal,dy33);
799 tz = _mm256_mul_ps(fscal,dz33);
801 /* Update vectorial force */
802 fix3 = _mm256_add_ps(fix3,tx);
803 fiy3 = _mm256_add_ps(fiy3,ty);
804 fiz3 = _mm256_add_ps(fiz3,tz);
806 fjx3 = _mm256_add_ps(fjx3,tx);
807 fjy3 = _mm256_add_ps(fjy3,ty);
808 fjz3 = _mm256_add_ps(fjz3,tz);
810 fjptrA = f+j_coord_offsetA;
811 fjptrB = f+j_coord_offsetB;
812 fjptrC = f+j_coord_offsetC;
813 fjptrD = f+j_coord_offsetD;
814 fjptrE = f+j_coord_offsetE;
815 fjptrF = f+j_coord_offsetF;
816 fjptrG = f+j_coord_offsetG;
817 fjptrH = f+j_coord_offsetH;
819 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
820 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
821 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
823 /* Inner loop uses 422 flops */
829 /* Get j neighbor index, and coordinate index */
830 jnrlistA = jjnr[jidx];
831 jnrlistB = jjnr[jidx+1];
832 jnrlistC = jjnr[jidx+2];
833 jnrlistD = jjnr[jidx+3];
834 jnrlistE = jjnr[jidx+4];
835 jnrlistF = jjnr[jidx+5];
836 jnrlistG = jjnr[jidx+6];
837 jnrlistH = jjnr[jidx+7];
838 /* Sign of each element will be negative for non-real atoms.
839 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
840 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
842 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
843 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
845 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
846 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
847 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
848 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
849 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
850 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
851 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
852 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
853 j_coord_offsetA = DIM*jnrA;
854 j_coord_offsetB = DIM*jnrB;
855 j_coord_offsetC = DIM*jnrC;
856 j_coord_offsetD = DIM*jnrD;
857 j_coord_offsetE = DIM*jnrE;
858 j_coord_offsetF = DIM*jnrF;
859 j_coord_offsetG = DIM*jnrG;
860 j_coord_offsetH = DIM*jnrH;
862 /* load j atom coordinates */
863 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
864 x+j_coord_offsetC,x+j_coord_offsetD,
865 x+j_coord_offsetE,x+j_coord_offsetF,
866 x+j_coord_offsetG,x+j_coord_offsetH,
867 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
868 &jy2,&jz2,&jx3,&jy3,&jz3);
870 /* Calculate displacement vector */
871 dx00 = _mm256_sub_ps(ix0,jx0);
872 dy00 = _mm256_sub_ps(iy0,jy0);
873 dz00 = _mm256_sub_ps(iz0,jz0);
874 dx11 = _mm256_sub_ps(ix1,jx1);
875 dy11 = _mm256_sub_ps(iy1,jy1);
876 dz11 = _mm256_sub_ps(iz1,jz1);
877 dx12 = _mm256_sub_ps(ix1,jx2);
878 dy12 = _mm256_sub_ps(iy1,jy2);
879 dz12 = _mm256_sub_ps(iz1,jz2);
880 dx13 = _mm256_sub_ps(ix1,jx3);
881 dy13 = _mm256_sub_ps(iy1,jy3);
882 dz13 = _mm256_sub_ps(iz1,jz3);
883 dx21 = _mm256_sub_ps(ix2,jx1);
884 dy21 = _mm256_sub_ps(iy2,jy1);
885 dz21 = _mm256_sub_ps(iz2,jz1);
886 dx22 = _mm256_sub_ps(ix2,jx2);
887 dy22 = _mm256_sub_ps(iy2,jy2);
888 dz22 = _mm256_sub_ps(iz2,jz2);
889 dx23 = _mm256_sub_ps(ix2,jx3);
890 dy23 = _mm256_sub_ps(iy2,jy3);
891 dz23 = _mm256_sub_ps(iz2,jz3);
892 dx31 = _mm256_sub_ps(ix3,jx1);
893 dy31 = _mm256_sub_ps(iy3,jy1);
894 dz31 = _mm256_sub_ps(iz3,jz1);
895 dx32 = _mm256_sub_ps(ix3,jx2);
896 dy32 = _mm256_sub_ps(iy3,jy2);
897 dz32 = _mm256_sub_ps(iz3,jz2);
898 dx33 = _mm256_sub_ps(ix3,jx3);
899 dy33 = _mm256_sub_ps(iy3,jy3);
900 dz33 = _mm256_sub_ps(iz3,jz3);
902 /* Calculate squared distance and things based on it */
903 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
904 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
905 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
906 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
907 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
908 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
909 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
910 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
911 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
912 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
914 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
915 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
916 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
917 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
918 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
919 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
920 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
921 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
922 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
924 rinvsq00 = gmx_mm256_inv_ps(rsq00);
926 fjx0 = _mm256_setzero_ps();
927 fjy0 = _mm256_setzero_ps();
928 fjz0 = _mm256_setzero_ps();
929 fjx1 = _mm256_setzero_ps();
930 fjy1 = _mm256_setzero_ps();
931 fjz1 = _mm256_setzero_ps();
932 fjx2 = _mm256_setzero_ps();
933 fjy2 = _mm256_setzero_ps();
934 fjz2 = _mm256_setzero_ps();
935 fjx3 = _mm256_setzero_ps();
936 fjy3 = _mm256_setzero_ps();
937 fjz3 = _mm256_setzero_ps();
939 /**************************
940 * CALCULATE INTERACTIONS *
941 **************************/
943 /* LENNARD-JONES DISPERSION/REPULSION */
945 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
946 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
947 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
948 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
949 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
951 /* Update potential sum for this i atom from the interaction with this j atom. */
952 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
953 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
957 fscal = _mm256_andnot_ps(dummy_mask,fscal);
959 /* Calculate temporary vectorial force */
960 tx = _mm256_mul_ps(fscal,dx00);
961 ty = _mm256_mul_ps(fscal,dy00);
962 tz = _mm256_mul_ps(fscal,dz00);
964 /* Update vectorial force */
965 fix0 = _mm256_add_ps(fix0,tx);
966 fiy0 = _mm256_add_ps(fiy0,ty);
967 fiz0 = _mm256_add_ps(fiz0,tz);
969 fjx0 = _mm256_add_ps(fjx0,tx);
970 fjy0 = _mm256_add_ps(fjy0,ty);
971 fjz0 = _mm256_add_ps(fjz0,tz);
973 /**************************
974 * CALCULATE INTERACTIONS *
975 **************************/
977 r11 = _mm256_mul_ps(rsq11,rinv11);
978 r11 = _mm256_andnot_ps(dummy_mask,r11);
980 /* Calculate table index by multiplying r with table scale and truncate to integer */
981 rt = _mm256_mul_ps(r11,vftabscale);
982 vfitab = _mm256_cvttps_epi32(rt);
983 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
984 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
985 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
986 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
987 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
988 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
990 /* CUBIC SPLINE TABLE ELECTROSTATICS */
991 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
992 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
993 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
994 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
995 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
996 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
997 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
998 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
999 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1000 Heps = _mm256_mul_ps(vfeps,H);
1001 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1002 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1003 velec = _mm256_mul_ps(qq11,VV);
1004 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1005 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1007 /* Update potential sum for this i atom from the interaction with this j atom. */
1008 velec = _mm256_andnot_ps(dummy_mask,velec);
1009 velecsum = _mm256_add_ps(velecsum,velec);
1013 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1015 /* Calculate temporary vectorial force */
1016 tx = _mm256_mul_ps(fscal,dx11);
1017 ty = _mm256_mul_ps(fscal,dy11);
1018 tz = _mm256_mul_ps(fscal,dz11);
1020 /* Update vectorial force */
1021 fix1 = _mm256_add_ps(fix1,tx);
1022 fiy1 = _mm256_add_ps(fiy1,ty);
1023 fiz1 = _mm256_add_ps(fiz1,tz);
1025 fjx1 = _mm256_add_ps(fjx1,tx);
1026 fjy1 = _mm256_add_ps(fjy1,ty);
1027 fjz1 = _mm256_add_ps(fjz1,tz);
1029 /**************************
1030 * CALCULATE INTERACTIONS *
1031 **************************/
1033 r12 = _mm256_mul_ps(rsq12,rinv12);
1034 r12 = _mm256_andnot_ps(dummy_mask,r12);
1036 /* Calculate table index by multiplying r with table scale and truncate to integer */
1037 rt = _mm256_mul_ps(r12,vftabscale);
1038 vfitab = _mm256_cvttps_epi32(rt);
1039 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1040 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1041 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1042 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1043 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1044 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1046 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1047 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1048 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1049 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1050 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1051 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1052 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1053 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1054 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1055 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1056 Heps = _mm256_mul_ps(vfeps,H);
1057 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1058 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1059 velec = _mm256_mul_ps(qq12,VV);
1060 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1061 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1063 /* Update potential sum for this i atom from the interaction with this j atom. */
1064 velec = _mm256_andnot_ps(dummy_mask,velec);
1065 velecsum = _mm256_add_ps(velecsum,velec);
1069 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1071 /* Calculate temporary vectorial force */
1072 tx = _mm256_mul_ps(fscal,dx12);
1073 ty = _mm256_mul_ps(fscal,dy12);
1074 tz = _mm256_mul_ps(fscal,dz12);
1076 /* Update vectorial force */
1077 fix1 = _mm256_add_ps(fix1,tx);
1078 fiy1 = _mm256_add_ps(fiy1,ty);
1079 fiz1 = _mm256_add_ps(fiz1,tz);
1081 fjx2 = _mm256_add_ps(fjx2,tx);
1082 fjy2 = _mm256_add_ps(fjy2,ty);
1083 fjz2 = _mm256_add_ps(fjz2,tz);
1085 /**************************
1086 * CALCULATE INTERACTIONS *
1087 **************************/
1089 r13 = _mm256_mul_ps(rsq13,rinv13);
1090 r13 = _mm256_andnot_ps(dummy_mask,r13);
1092 /* Calculate table index by multiplying r with table scale and truncate to integer */
1093 rt = _mm256_mul_ps(r13,vftabscale);
1094 vfitab = _mm256_cvttps_epi32(rt);
1095 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1096 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1097 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1098 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1099 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1100 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1102 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1103 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1104 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1105 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1106 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1107 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1108 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1109 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1110 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1111 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1112 Heps = _mm256_mul_ps(vfeps,H);
1113 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1114 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1115 velec = _mm256_mul_ps(qq13,VV);
1116 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1117 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
1119 /* Update potential sum for this i atom from the interaction with this j atom. */
1120 velec = _mm256_andnot_ps(dummy_mask,velec);
1121 velecsum = _mm256_add_ps(velecsum,velec);
1125 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1127 /* Calculate temporary vectorial force */
1128 tx = _mm256_mul_ps(fscal,dx13);
1129 ty = _mm256_mul_ps(fscal,dy13);
1130 tz = _mm256_mul_ps(fscal,dz13);
1132 /* Update vectorial force */
1133 fix1 = _mm256_add_ps(fix1,tx);
1134 fiy1 = _mm256_add_ps(fiy1,ty);
1135 fiz1 = _mm256_add_ps(fiz1,tz);
1137 fjx3 = _mm256_add_ps(fjx3,tx);
1138 fjy3 = _mm256_add_ps(fjy3,ty);
1139 fjz3 = _mm256_add_ps(fjz3,tz);
1141 /**************************
1142 * CALCULATE INTERACTIONS *
1143 **************************/
1145 r21 = _mm256_mul_ps(rsq21,rinv21);
1146 r21 = _mm256_andnot_ps(dummy_mask,r21);
1148 /* Calculate table index by multiplying r with table scale and truncate to integer */
1149 rt = _mm256_mul_ps(r21,vftabscale);
1150 vfitab = _mm256_cvttps_epi32(rt);
1151 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1152 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1153 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1154 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1155 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1156 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1158 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1159 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1160 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1161 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1162 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1163 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1164 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1165 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1166 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1167 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1168 Heps = _mm256_mul_ps(vfeps,H);
1169 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1170 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1171 velec = _mm256_mul_ps(qq21,VV);
1172 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1173 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1175 /* Update potential sum for this i atom from the interaction with this j atom. */
1176 velec = _mm256_andnot_ps(dummy_mask,velec);
1177 velecsum = _mm256_add_ps(velecsum,velec);
1181 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1183 /* Calculate temporary vectorial force */
1184 tx = _mm256_mul_ps(fscal,dx21);
1185 ty = _mm256_mul_ps(fscal,dy21);
1186 tz = _mm256_mul_ps(fscal,dz21);
1188 /* Update vectorial force */
1189 fix2 = _mm256_add_ps(fix2,tx);
1190 fiy2 = _mm256_add_ps(fiy2,ty);
1191 fiz2 = _mm256_add_ps(fiz2,tz);
1193 fjx1 = _mm256_add_ps(fjx1,tx);
1194 fjy1 = _mm256_add_ps(fjy1,ty);
1195 fjz1 = _mm256_add_ps(fjz1,tz);
1197 /**************************
1198 * CALCULATE INTERACTIONS *
1199 **************************/
1201 r22 = _mm256_mul_ps(rsq22,rinv22);
1202 r22 = _mm256_andnot_ps(dummy_mask,r22);
1204 /* Calculate table index by multiplying r with table scale and truncate to integer */
1205 rt = _mm256_mul_ps(r22,vftabscale);
1206 vfitab = _mm256_cvttps_epi32(rt);
1207 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1208 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1209 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1210 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1211 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1212 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1214 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1215 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1216 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1217 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1218 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1219 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1220 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1221 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1222 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1223 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1224 Heps = _mm256_mul_ps(vfeps,H);
1225 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1226 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1227 velec = _mm256_mul_ps(qq22,VV);
1228 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1229 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1231 /* Update potential sum for this i atom from the interaction with this j atom. */
1232 velec = _mm256_andnot_ps(dummy_mask,velec);
1233 velecsum = _mm256_add_ps(velecsum,velec);
1237 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1239 /* Calculate temporary vectorial force */
1240 tx = _mm256_mul_ps(fscal,dx22);
1241 ty = _mm256_mul_ps(fscal,dy22);
1242 tz = _mm256_mul_ps(fscal,dz22);
1244 /* Update vectorial force */
1245 fix2 = _mm256_add_ps(fix2,tx);
1246 fiy2 = _mm256_add_ps(fiy2,ty);
1247 fiz2 = _mm256_add_ps(fiz2,tz);
1249 fjx2 = _mm256_add_ps(fjx2,tx);
1250 fjy2 = _mm256_add_ps(fjy2,ty);
1251 fjz2 = _mm256_add_ps(fjz2,tz);
1253 /**************************
1254 * CALCULATE INTERACTIONS *
1255 **************************/
1257 r23 = _mm256_mul_ps(rsq23,rinv23);
1258 r23 = _mm256_andnot_ps(dummy_mask,r23);
1260 /* Calculate table index by multiplying r with table scale and truncate to integer */
1261 rt = _mm256_mul_ps(r23,vftabscale);
1262 vfitab = _mm256_cvttps_epi32(rt);
1263 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1264 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1265 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1266 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1267 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1268 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1270 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1271 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1272 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1273 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1274 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1275 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1276 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1277 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1278 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1279 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1280 Heps = _mm256_mul_ps(vfeps,H);
1281 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1282 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1283 velec = _mm256_mul_ps(qq23,VV);
1284 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1285 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
1287 /* Update potential sum for this i atom from the interaction with this j atom. */
1288 velec = _mm256_andnot_ps(dummy_mask,velec);
1289 velecsum = _mm256_add_ps(velecsum,velec);
1293 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1295 /* Calculate temporary vectorial force */
1296 tx = _mm256_mul_ps(fscal,dx23);
1297 ty = _mm256_mul_ps(fscal,dy23);
1298 tz = _mm256_mul_ps(fscal,dz23);
1300 /* Update vectorial force */
1301 fix2 = _mm256_add_ps(fix2,tx);
1302 fiy2 = _mm256_add_ps(fiy2,ty);
1303 fiz2 = _mm256_add_ps(fiz2,tz);
1305 fjx3 = _mm256_add_ps(fjx3,tx);
1306 fjy3 = _mm256_add_ps(fjy3,ty);
1307 fjz3 = _mm256_add_ps(fjz3,tz);
1309 /**************************
1310 * CALCULATE INTERACTIONS *
1311 **************************/
1313 r31 = _mm256_mul_ps(rsq31,rinv31);
1314 r31 = _mm256_andnot_ps(dummy_mask,r31);
1316 /* Calculate table index by multiplying r with table scale and truncate to integer */
1317 rt = _mm256_mul_ps(r31,vftabscale);
1318 vfitab = _mm256_cvttps_epi32(rt);
1319 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1320 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1321 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1322 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1323 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1324 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1326 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1327 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1328 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1329 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1330 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1331 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1332 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1333 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1334 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1335 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1336 Heps = _mm256_mul_ps(vfeps,H);
1337 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1338 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1339 velec = _mm256_mul_ps(qq31,VV);
1340 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1341 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
1343 /* Update potential sum for this i atom from the interaction with this j atom. */
1344 velec = _mm256_andnot_ps(dummy_mask,velec);
1345 velecsum = _mm256_add_ps(velecsum,velec);
1349 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1351 /* Calculate temporary vectorial force */
1352 tx = _mm256_mul_ps(fscal,dx31);
1353 ty = _mm256_mul_ps(fscal,dy31);
1354 tz = _mm256_mul_ps(fscal,dz31);
1356 /* Update vectorial force */
1357 fix3 = _mm256_add_ps(fix3,tx);
1358 fiy3 = _mm256_add_ps(fiy3,ty);
1359 fiz3 = _mm256_add_ps(fiz3,tz);
1361 fjx1 = _mm256_add_ps(fjx1,tx);
1362 fjy1 = _mm256_add_ps(fjy1,ty);
1363 fjz1 = _mm256_add_ps(fjz1,tz);
1365 /**************************
1366 * CALCULATE INTERACTIONS *
1367 **************************/
1369 r32 = _mm256_mul_ps(rsq32,rinv32);
1370 r32 = _mm256_andnot_ps(dummy_mask,r32);
1372 /* Calculate table index by multiplying r with table scale and truncate to integer */
1373 rt = _mm256_mul_ps(r32,vftabscale);
1374 vfitab = _mm256_cvttps_epi32(rt);
1375 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1376 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1377 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1378 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1379 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1380 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1382 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1383 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1384 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1385 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1386 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1387 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1388 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1389 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1390 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1391 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1392 Heps = _mm256_mul_ps(vfeps,H);
1393 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1394 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1395 velec = _mm256_mul_ps(qq32,VV);
1396 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1397 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
1399 /* Update potential sum for this i atom from the interaction with this j atom. */
1400 velec = _mm256_andnot_ps(dummy_mask,velec);
1401 velecsum = _mm256_add_ps(velecsum,velec);
1405 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1407 /* Calculate temporary vectorial force */
1408 tx = _mm256_mul_ps(fscal,dx32);
1409 ty = _mm256_mul_ps(fscal,dy32);
1410 tz = _mm256_mul_ps(fscal,dz32);
1412 /* Update vectorial force */
1413 fix3 = _mm256_add_ps(fix3,tx);
1414 fiy3 = _mm256_add_ps(fiy3,ty);
1415 fiz3 = _mm256_add_ps(fiz3,tz);
1417 fjx2 = _mm256_add_ps(fjx2,tx);
1418 fjy2 = _mm256_add_ps(fjy2,ty);
1419 fjz2 = _mm256_add_ps(fjz2,tz);
1421 /**************************
1422 * CALCULATE INTERACTIONS *
1423 **************************/
1425 r33 = _mm256_mul_ps(rsq33,rinv33);
1426 r33 = _mm256_andnot_ps(dummy_mask,r33);
1428 /* Calculate table index by multiplying r with table scale and truncate to integer */
1429 rt = _mm256_mul_ps(r33,vftabscale);
1430 vfitab = _mm256_cvttps_epi32(rt);
1431 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1432 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1433 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1434 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1435 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1436 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1438 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1439 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1440 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1441 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1442 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1443 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1444 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1445 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1446 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1447 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1448 Heps = _mm256_mul_ps(vfeps,H);
1449 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1450 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1451 velec = _mm256_mul_ps(qq33,VV);
1452 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1453 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
1455 /* Update potential sum for this i atom from the interaction with this j atom. */
1456 velec = _mm256_andnot_ps(dummy_mask,velec);
1457 velecsum = _mm256_add_ps(velecsum,velec);
1461 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1463 /* Calculate temporary vectorial force */
1464 tx = _mm256_mul_ps(fscal,dx33);
1465 ty = _mm256_mul_ps(fscal,dy33);
1466 tz = _mm256_mul_ps(fscal,dz33);
1468 /* Update vectorial force */
1469 fix3 = _mm256_add_ps(fix3,tx);
1470 fiy3 = _mm256_add_ps(fiy3,ty);
1471 fiz3 = _mm256_add_ps(fiz3,tz);
1473 fjx3 = _mm256_add_ps(fjx3,tx);
1474 fjy3 = _mm256_add_ps(fjy3,ty);
1475 fjz3 = _mm256_add_ps(fjz3,tz);
1477 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1478 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1479 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1480 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1481 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1482 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1483 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1484 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1486 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1487 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1488 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1490 /* Inner loop uses 431 flops */
1493 /* End of innermost loop */
1495 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1496 f+i_coord_offset,fshift+i_shift_offset);
1499 /* Update potential energies */
1500 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1501 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1503 /* Increment number of inner iterations */
1504 inneriter += j_index_end - j_index_start;
1506 /* Outer loop uses 26 flops */
1509 /* Increment number of outer iterations */
1512 /* Update outer/inner flops */
1514 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*431);
1517 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_single
1518 * Electrostatics interaction: CubicSplineTable
1519 * VdW interaction: LennardJones
1520 * Geometry: Water4-Water4
1521 * Calculate force/pot: Force
1524 nb_kernel_ElecCSTab_VdwLJ_GeomW4W4_F_avx_256_single
1525 (t_nblist * gmx_restrict nlist,
1526 rvec * gmx_restrict xx,
1527 rvec * gmx_restrict ff,
1528 t_forcerec * gmx_restrict fr,
1529 t_mdatoms * gmx_restrict mdatoms,
1530 nb_kernel_data_t * gmx_restrict kernel_data,
1531 t_nrnb * gmx_restrict nrnb)
1533 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1534 * just 0 for non-waters.
1535 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1536 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1538 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1539 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1540 int jnrA,jnrB,jnrC,jnrD;
1541 int jnrE,jnrF,jnrG,jnrH;
1542 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1543 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1544 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1545 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1546 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1547 real rcutoff_scalar;
1548 real *shiftvec,*fshift,*x,*f;
1549 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1550 real scratch[4*DIM];
1551 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1552 real * vdwioffsetptr0;
1553 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1554 real * vdwioffsetptr1;
1555 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1556 real * vdwioffsetptr2;
1557 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1558 real * vdwioffsetptr3;
1559 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1560 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1561 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1562 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1563 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1564 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1565 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1566 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1567 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1568 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1569 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1570 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1571 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1572 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1573 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1574 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1575 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1576 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1577 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1578 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1581 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1584 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1585 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1587 __m128i vfitab_lo,vfitab_hi;
1588 __m128i ifour = _mm_set1_epi32(4);
1589 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1591 __m256 dummy_mask,cutoff_mask;
1592 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1593 __m256 one = _mm256_set1_ps(1.0);
1594 __m256 two = _mm256_set1_ps(2.0);
1600 jindex = nlist->jindex;
1602 shiftidx = nlist->shift;
1604 shiftvec = fr->shift_vec[0];
1605 fshift = fr->fshift[0];
1606 facel = _mm256_set1_ps(fr->epsfac);
1607 charge = mdatoms->chargeA;
1608 nvdwtype = fr->ntype;
1609 vdwparam = fr->nbfp;
1610 vdwtype = mdatoms->typeA;
1612 vftab = kernel_data->table_elec->data;
1613 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
1615 /* Setup water-specific parameters */
1616 inr = nlist->iinr[0];
1617 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1618 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1619 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1620 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1622 jq1 = _mm256_set1_ps(charge[inr+1]);
1623 jq2 = _mm256_set1_ps(charge[inr+2]);
1624 jq3 = _mm256_set1_ps(charge[inr+3]);
1625 vdwjidx0A = 2*vdwtype[inr+0];
1626 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1627 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1628 qq11 = _mm256_mul_ps(iq1,jq1);
1629 qq12 = _mm256_mul_ps(iq1,jq2);
1630 qq13 = _mm256_mul_ps(iq1,jq3);
1631 qq21 = _mm256_mul_ps(iq2,jq1);
1632 qq22 = _mm256_mul_ps(iq2,jq2);
1633 qq23 = _mm256_mul_ps(iq2,jq3);
1634 qq31 = _mm256_mul_ps(iq3,jq1);
1635 qq32 = _mm256_mul_ps(iq3,jq2);
1636 qq33 = _mm256_mul_ps(iq3,jq3);
1638 /* Avoid stupid compiler warnings */
1639 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1640 j_coord_offsetA = 0;
1641 j_coord_offsetB = 0;
1642 j_coord_offsetC = 0;
1643 j_coord_offsetD = 0;
1644 j_coord_offsetE = 0;
1645 j_coord_offsetF = 0;
1646 j_coord_offsetG = 0;
1647 j_coord_offsetH = 0;
1652 for(iidx=0;iidx<4*DIM;iidx++)
1654 scratch[iidx] = 0.0;
1657 /* Start outer loop over neighborlists */
1658 for(iidx=0; iidx<nri; iidx++)
1660 /* Load shift vector for this list */
1661 i_shift_offset = DIM*shiftidx[iidx];
1663 /* Load limits for loop over neighbors */
1664 j_index_start = jindex[iidx];
1665 j_index_end = jindex[iidx+1];
1667 /* Get outer coordinate index */
1669 i_coord_offset = DIM*inr;
1671 /* Load i particle coords and add shift vector */
1672 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1673 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1675 fix0 = _mm256_setzero_ps();
1676 fiy0 = _mm256_setzero_ps();
1677 fiz0 = _mm256_setzero_ps();
1678 fix1 = _mm256_setzero_ps();
1679 fiy1 = _mm256_setzero_ps();
1680 fiz1 = _mm256_setzero_ps();
1681 fix2 = _mm256_setzero_ps();
1682 fiy2 = _mm256_setzero_ps();
1683 fiz2 = _mm256_setzero_ps();
1684 fix3 = _mm256_setzero_ps();
1685 fiy3 = _mm256_setzero_ps();
1686 fiz3 = _mm256_setzero_ps();
1688 /* Start inner kernel loop */
1689 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1692 /* Get j neighbor index, and coordinate index */
1694 jnrB = jjnr[jidx+1];
1695 jnrC = jjnr[jidx+2];
1696 jnrD = jjnr[jidx+3];
1697 jnrE = jjnr[jidx+4];
1698 jnrF = jjnr[jidx+5];
1699 jnrG = jjnr[jidx+6];
1700 jnrH = jjnr[jidx+7];
1701 j_coord_offsetA = DIM*jnrA;
1702 j_coord_offsetB = DIM*jnrB;
1703 j_coord_offsetC = DIM*jnrC;
1704 j_coord_offsetD = DIM*jnrD;
1705 j_coord_offsetE = DIM*jnrE;
1706 j_coord_offsetF = DIM*jnrF;
1707 j_coord_offsetG = DIM*jnrG;
1708 j_coord_offsetH = DIM*jnrH;
1710 /* load j atom coordinates */
1711 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1712 x+j_coord_offsetC,x+j_coord_offsetD,
1713 x+j_coord_offsetE,x+j_coord_offsetF,
1714 x+j_coord_offsetG,x+j_coord_offsetH,
1715 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1716 &jy2,&jz2,&jx3,&jy3,&jz3);
1718 /* Calculate displacement vector */
1719 dx00 = _mm256_sub_ps(ix0,jx0);
1720 dy00 = _mm256_sub_ps(iy0,jy0);
1721 dz00 = _mm256_sub_ps(iz0,jz0);
1722 dx11 = _mm256_sub_ps(ix1,jx1);
1723 dy11 = _mm256_sub_ps(iy1,jy1);
1724 dz11 = _mm256_sub_ps(iz1,jz1);
1725 dx12 = _mm256_sub_ps(ix1,jx2);
1726 dy12 = _mm256_sub_ps(iy1,jy2);
1727 dz12 = _mm256_sub_ps(iz1,jz2);
1728 dx13 = _mm256_sub_ps(ix1,jx3);
1729 dy13 = _mm256_sub_ps(iy1,jy3);
1730 dz13 = _mm256_sub_ps(iz1,jz3);
1731 dx21 = _mm256_sub_ps(ix2,jx1);
1732 dy21 = _mm256_sub_ps(iy2,jy1);
1733 dz21 = _mm256_sub_ps(iz2,jz1);
1734 dx22 = _mm256_sub_ps(ix2,jx2);
1735 dy22 = _mm256_sub_ps(iy2,jy2);
1736 dz22 = _mm256_sub_ps(iz2,jz2);
1737 dx23 = _mm256_sub_ps(ix2,jx3);
1738 dy23 = _mm256_sub_ps(iy2,jy3);
1739 dz23 = _mm256_sub_ps(iz2,jz3);
1740 dx31 = _mm256_sub_ps(ix3,jx1);
1741 dy31 = _mm256_sub_ps(iy3,jy1);
1742 dz31 = _mm256_sub_ps(iz3,jz1);
1743 dx32 = _mm256_sub_ps(ix3,jx2);
1744 dy32 = _mm256_sub_ps(iy3,jy2);
1745 dz32 = _mm256_sub_ps(iz3,jz2);
1746 dx33 = _mm256_sub_ps(ix3,jx3);
1747 dy33 = _mm256_sub_ps(iy3,jy3);
1748 dz33 = _mm256_sub_ps(iz3,jz3);
1750 /* Calculate squared distance and things based on it */
1751 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1752 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1753 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1754 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1755 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1756 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1757 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1758 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1759 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1760 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1762 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1763 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1764 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1765 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1766 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1767 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1768 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1769 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1770 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1772 rinvsq00 = gmx_mm256_inv_ps(rsq00);
1774 fjx0 = _mm256_setzero_ps();
1775 fjy0 = _mm256_setzero_ps();
1776 fjz0 = _mm256_setzero_ps();
1777 fjx1 = _mm256_setzero_ps();
1778 fjy1 = _mm256_setzero_ps();
1779 fjz1 = _mm256_setzero_ps();
1780 fjx2 = _mm256_setzero_ps();
1781 fjy2 = _mm256_setzero_ps();
1782 fjz2 = _mm256_setzero_ps();
1783 fjx3 = _mm256_setzero_ps();
1784 fjy3 = _mm256_setzero_ps();
1785 fjz3 = _mm256_setzero_ps();
1787 /**************************
1788 * CALCULATE INTERACTIONS *
1789 **************************/
1791 /* LENNARD-JONES DISPERSION/REPULSION */
1793 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1794 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1798 /* Calculate temporary vectorial force */
1799 tx = _mm256_mul_ps(fscal,dx00);
1800 ty = _mm256_mul_ps(fscal,dy00);
1801 tz = _mm256_mul_ps(fscal,dz00);
1803 /* Update vectorial force */
1804 fix0 = _mm256_add_ps(fix0,tx);
1805 fiy0 = _mm256_add_ps(fiy0,ty);
1806 fiz0 = _mm256_add_ps(fiz0,tz);
1808 fjx0 = _mm256_add_ps(fjx0,tx);
1809 fjy0 = _mm256_add_ps(fjy0,ty);
1810 fjz0 = _mm256_add_ps(fjz0,tz);
1812 /**************************
1813 * CALCULATE INTERACTIONS *
1814 **************************/
1816 r11 = _mm256_mul_ps(rsq11,rinv11);
1818 /* Calculate table index by multiplying r with table scale and truncate to integer */
1819 rt = _mm256_mul_ps(r11,vftabscale);
1820 vfitab = _mm256_cvttps_epi32(rt);
1821 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1822 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1823 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1824 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1825 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1826 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1828 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1829 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1830 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1831 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1832 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1833 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1834 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1835 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1836 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1837 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1838 Heps = _mm256_mul_ps(vfeps,H);
1839 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1840 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1841 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1845 /* Calculate temporary vectorial force */
1846 tx = _mm256_mul_ps(fscal,dx11);
1847 ty = _mm256_mul_ps(fscal,dy11);
1848 tz = _mm256_mul_ps(fscal,dz11);
1850 /* Update vectorial force */
1851 fix1 = _mm256_add_ps(fix1,tx);
1852 fiy1 = _mm256_add_ps(fiy1,ty);
1853 fiz1 = _mm256_add_ps(fiz1,tz);
1855 fjx1 = _mm256_add_ps(fjx1,tx);
1856 fjy1 = _mm256_add_ps(fjy1,ty);
1857 fjz1 = _mm256_add_ps(fjz1,tz);
1859 /**************************
1860 * CALCULATE INTERACTIONS *
1861 **************************/
1863 r12 = _mm256_mul_ps(rsq12,rinv12);
1865 /* Calculate table index by multiplying r with table scale and truncate to integer */
1866 rt = _mm256_mul_ps(r12,vftabscale);
1867 vfitab = _mm256_cvttps_epi32(rt);
1868 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1869 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1870 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1871 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1872 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1873 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1875 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1876 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1877 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1878 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1879 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1880 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1881 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1882 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1883 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1884 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1885 Heps = _mm256_mul_ps(vfeps,H);
1886 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1887 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1888 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1892 /* Calculate temporary vectorial force */
1893 tx = _mm256_mul_ps(fscal,dx12);
1894 ty = _mm256_mul_ps(fscal,dy12);
1895 tz = _mm256_mul_ps(fscal,dz12);
1897 /* Update vectorial force */
1898 fix1 = _mm256_add_ps(fix1,tx);
1899 fiy1 = _mm256_add_ps(fiy1,ty);
1900 fiz1 = _mm256_add_ps(fiz1,tz);
1902 fjx2 = _mm256_add_ps(fjx2,tx);
1903 fjy2 = _mm256_add_ps(fjy2,ty);
1904 fjz2 = _mm256_add_ps(fjz2,tz);
1906 /**************************
1907 * CALCULATE INTERACTIONS *
1908 **************************/
1910 r13 = _mm256_mul_ps(rsq13,rinv13);
1912 /* Calculate table index by multiplying r with table scale and truncate to integer */
1913 rt = _mm256_mul_ps(r13,vftabscale);
1914 vfitab = _mm256_cvttps_epi32(rt);
1915 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1916 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1917 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1918 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1919 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1920 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1922 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1923 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1924 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1925 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1926 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1927 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1928 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1929 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1930 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1931 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1932 Heps = _mm256_mul_ps(vfeps,H);
1933 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1934 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1935 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
1939 /* Calculate temporary vectorial force */
1940 tx = _mm256_mul_ps(fscal,dx13);
1941 ty = _mm256_mul_ps(fscal,dy13);
1942 tz = _mm256_mul_ps(fscal,dz13);
1944 /* Update vectorial force */
1945 fix1 = _mm256_add_ps(fix1,tx);
1946 fiy1 = _mm256_add_ps(fiy1,ty);
1947 fiz1 = _mm256_add_ps(fiz1,tz);
1949 fjx3 = _mm256_add_ps(fjx3,tx);
1950 fjy3 = _mm256_add_ps(fjy3,ty);
1951 fjz3 = _mm256_add_ps(fjz3,tz);
1953 /**************************
1954 * CALCULATE INTERACTIONS *
1955 **************************/
1957 r21 = _mm256_mul_ps(rsq21,rinv21);
1959 /* Calculate table index by multiplying r with table scale and truncate to integer */
1960 rt = _mm256_mul_ps(r21,vftabscale);
1961 vfitab = _mm256_cvttps_epi32(rt);
1962 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1963 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1964 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1965 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1966 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1967 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1969 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1970 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1971 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1972 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1973 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1974 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1975 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1976 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1977 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1978 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1979 Heps = _mm256_mul_ps(vfeps,H);
1980 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1981 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1982 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1986 /* Calculate temporary vectorial force */
1987 tx = _mm256_mul_ps(fscal,dx21);
1988 ty = _mm256_mul_ps(fscal,dy21);
1989 tz = _mm256_mul_ps(fscal,dz21);
1991 /* Update vectorial force */
1992 fix2 = _mm256_add_ps(fix2,tx);
1993 fiy2 = _mm256_add_ps(fiy2,ty);
1994 fiz2 = _mm256_add_ps(fiz2,tz);
1996 fjx1 = _mm256_add_ps(fjx1,tx);
1997 fjy1 = _mm256_add_ps(fjy1,ty);
1998 fjz1 = _mm256_add_ps(fjz1,tz);
2000 /**************************
2001 * CALCULATE INTERACTIONS *
2002 **************************/
2004 r22 = _mm256_mul_ps(rsq22,rinv22);
2006 /* Calculate table index by multiplying r with table scale and truncate to integer */
2007 rt = _mm256_mul_ps(r22,vftabscale);
2008 vfitab = _mm256_cvttps_epi32(rt);
2009 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2010 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2011 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2012 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2013 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2014 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2016 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2017 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2018 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2019 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2020 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2021 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2022 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2023 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2024 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2025 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2026 Heps = _mm256_mul_ps(vfeps,H);
2027 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2028 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2029 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2033 /* Calculate temporary vectorial force */
2034 tx = _mm256_mul_ps(fscal,dx22);
2035 ty = _mm256_mul_ps(fscal,dy22);
2036 tz = _mm256_mul_ps(fscal,dz22);
2038 /* Update vectorial force */
2039 fix2 = _mm256_add_ps(fix2,tx);
2040 fiy2 = _mm256_add_ps(fiy2,ty);
2041 fiz2 = _mm256_add_ps(fiz2,tz);
2043 fjx2 = _mm256_add_ps(fjx2,tx);
2044 fjy2 = _mm256_add_ps(fjy2,ty);
2045 fjz2 = _mm256_add_ps(fjz2,tz);
2047 /**************************
2048 * CALCULATE INTERACTIONS *
2049 **************************/
2051 r23 = _mm256_mul_ps(rsq23,rinv23);
2053 /* Calculate table index by multiplying r with table scale and truncate to integer */
2054 rt = _mm256_mul_ps(r23,vftabscale);
2055 vfitab = _mm256_cvttps_epi32(rt);
2056 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2057 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2058 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2059 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2060 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2061 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2063 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2064 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2065 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2066 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2067 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2068 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2069 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2070 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2071 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2072 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2073 Heps = _mm256_mul_ps(vfeps,H);
2074 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2075 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2076 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2080 /* Calculate temporary vectorial force */
2081 tx = _mm256_mul_ps(fscal,dx23);
2082 ty = _mm256_mul_ps(fscal,dy23);
2083 tz = _mm256_mul_ps(fscal,dz23);
2085 /* Update vectorial force */
2086 fix2 = _mm256_add_ps(fix2,tx);
2087 fiy2 = _mm256_add_ps(fiy2,ty);
2088 fiz2 = _mm256_add_ps(fiz2,tz);
2090 fjx3 = _mm256_add_ps(fjx3,tx);
2091 fjy3 = _mm256_add_ps(fjy3,ty);
2092 fjz3 = _mm256_add_ps(fjz3,tz);
2094 /**************************
2095 * CALCULATE INTERACTIONS *
2096 **************************/
2098 r31 = _mm256_mul_ps(rsq31,rinv31);
2100 /* Calculate table index by multiplying r with table scale and truncate to integer */
2101 rt = _mm256_mul_ps(r31,vftabscale);
2102 vfitab = _mm256_cvttps_epi32(rt);
2103 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2104 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2105 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2106 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2107 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2108 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2110 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2111 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2112 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2113 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2114 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2115 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2116 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2117 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2118 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2119 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2120 Heps = _mm256_mul_ps(vfeps,H);
2121 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2122 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2123 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2127 /* Calculate temporary vectorial force */
2128 tx = _mm256_mul_ps(fscal,dx31);
2129 ty = _mm256_mul_ps(fscal,dy31);
2130 tz = _mm256_mul_ps(fscal,dz31);
2132 /* Update vectorial force */
2133 fix3 = _mm256_add_ps(fix3,tx);
2134 fiy3 = _mm256_add_ps(fiy3,ty);
2135 fiz3 = _mm256_add_ps(fiz3,tz);
2137 fjx1 = _mm256_add_ps(fjx1,tx);
2138 fjy1 = _mm256_add_ps(fjy1,ty);
2139 fjz1 = _mm256_add_ps(fjz1,tz);
2141 /**************************
2142 * CALCULATE INTERACTIONS *
2143 **************************/
2145 r32 = _mm256_mul_ps(rsq32,rinv32);
2147 /* Calculate table index by multiplying r with table scale and truncate to integer */
2148 rt = _mm256_mul_ps(r32,vftabscale);
2149 vfitab = _mm256_cvttps_epi32(rt);
2150 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2151 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2152 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2153 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2154 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2155 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2157 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2158 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2159 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2160 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2161 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2162 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2163 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2164 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2165 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2166 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2167 Heps = _mm256_mul_ps(vfeps,H);
2168 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2169 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2170 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2174 /* Calculate temporary vectorial force */
2175 tx = _mm256_mul_ps(fscal,dx32);
2176 ty = _mm256_mul_ps(fscal,dy32);
2177 tz = _mm256_mul_ps(fscal,dz32);
2179 /* Update vectorial force */
2180 fix3 = _mm256_add_ps(fix3,tx);
2181 fiy3 = _mm256_add_ps(fiy3,ty);
2182 fiz3 = _mm256_add_ps(fiz3,tz);
2184 fjx2 = _mm256_add_ps(fjx2,tx);
2185 fjy2 = _mm256_add_ps(fjy2,ty);
2186 fjz2 = _mm256_add_ps(fjz2,tz);
2188 /**************************
2189 * CALCULATE INTERACTIONS *
2190 **************************/
2192 r33 = _mm256_mul_ps(rsq33,rinv33);
2194 /* Calculate table index by multiplying r with table scale and truncate to integer */
2195 rt = _mm256_mul_ps(r33,vftabscale);
2196 vfitab = _mm256_cvttps_epi32(rt);
2197 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2198 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2199 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2200 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2201 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2202 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2204 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2205 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2206 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2207 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2208 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2209 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2210 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2211 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2212 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2213 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2214 Heps = _mm256_mul_ps(vfeps,H);
2215 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2216 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2217 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2221 /* Calculate temporary vectorial force */
2222 tx = _mm256_mul_ps(fscal,dx33);
2223 ty = _mm256_mul_ps(fscal,dy33);
2224 tz = _mm256_mul_ps(fscal,dz33);
2226 /* Update vectorial force */
2227 fix3 = _mm256_add_ps(fix3,tx);
2228 fiy3 = _mm256_add_ps(fiy3,ty);
2229 fiz3 = _mm256_add_ps(fiz3,tz);
2231 fjx3 = _mm256_add_ps(fjx3,tx);
2232 fjy3 = _mm256_add_ps(fjy3,ty);
2233 fjz3 = _mm256_add_ps(fjz3,tz);
2235 fjptrA = f+j_coord_offsetA;
2236 fjptrB = f+j_coord_offsetB;
2237 fjptrC = f+j_coord_offsetC;
2238 fjptrD = f+j_coord_offsetD;
2239 fjptrE = f+j_coord_offsetE;
2240 fjptrF = f+j_coord_offsetF;
2241 fjptrG = f+j_coord_offsetG;
2242 fjptrH = f+j_coord_offsetH;
2244 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2245 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2246 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2248 /* Inner loop uses 381 flops */
2251 if(jidx<j_index_end)
2254 /* Get j neighbor index, and coordinate index */
2255 jnrlistA = jjnr[jidx];
2256 jnrlistB = jjnr[jidx+1];
2257 jnrlistC = jjnr[jidx+2];
2258 jnrlistD = jjnr[jidx+3];
2259 jnrlistE = jjnr[jidx+4];
2260 jnrlistF = jjnr[jidx+5];
2261 jnrlistG = jjnr[jidx+6];
2262 jnrlistH = jjnr[jidx+7];
2263 /* Sign of each element will be negative for non-real atoms.
2264 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2265 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2267 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2268 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2270 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2271 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2272 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2273 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2274 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2275 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2276 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2277 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2278 j_coord_offsetA = DIM*jnrA;
2279 j_coord_offsetB = DIM*jnrB;
2280 j_coord_offsetC = DIM*jnrC;
2281 j_coord_offsetD = DIM*jnrD;
2282 j_coord_offsetE = DIM*jnrE;
2283 j_coord_offsetF = DIM*jnrF;
2284 j_coord_offsetG = DIM*jnrG;
2285 j_coord_offsetH = DIM*jnrH;
2287 /* load j atom coordinates */
2288 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2289 x+j_coord_offsetC,x+j_coord_offsetD,
2290 x+j_coord_offsetE,x+j_coord_offsetF,
2291 x+j_coord_offsetG,x+j_coord_offsetH,
2292 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2293 &jy2,&jz2,&jx3,&jy3,&jz3);
2295 /* Calculate displacement vector */
2296 dx00 = _mm256_sub_ps(ix0,jx0);
2297 dy00 = _mm256_sub_ps(iy0,jy0);
2298 dz00 = _mm256_sub_ps(iz0,jz0);
2299 dx11 = _mm256_sub_ps(ix1,jx1);
2300 dy11 = _mm256_sub_ps(iy1,jy1);
2301 dz11 = _mm256_sub_ps(iz1,jz1);
2302 dx12 = _mm256_sub_ps(ix1,jx2);
2303 dy12 = _mm256_sub_ps(iy1,jy2);
2304 dz12 = _mm256_sub_ps(iz1,jz2);
2305 dx13 = _mm256_sub_ps(ix1,jx3);
2306 dy13 = _mm256_sub_ps(iy1,jy3);
2307 dz13 = _mm256_sub_ps(iz1,jz3);
2308 dx21 = _mm256_sub_ps(ix2,jx1);
2309 dy21 = _mm256_sub_ps(iy2,jy1);
2310 dz21 = _mm256_sub_ps(iz2,jz1);
2311 dx22 = _mm256_sub_ps(ix2,jx2);
2312 dy22 = _mm256_sub_ps(iy2,jy2);
2313 dz22 = _mm256_sub_ps(iz2,jz2);
2314 dx23 = _mm256_sub_ps(ix2,jx3);
2315 dy23 = _mm256_sub_ps(iy2,jy3);
2316 dz23 = _mm256_sub_ps(iz2,jz3);
2317 dx31 = _mm256_sub_ps(ix3,jx1);
2318 dy31 = _mm256_sub_ps(iy3,jy1);
2319 dz31 = _mm256_sub_ps(iz3,jz1);
2320 dx32 = _mm256_sub_ps(ix3,jx2);
2321 dy32 = _mm256_sub_ps(iy3,jy2);
2322 dz32 = _mm256_sub_ps(iz3,jz2);
2323 dx33 = _mm256_sub_ps(ix3,jx3);
2324 dy33 = _mm256_sub_ps(iy3,jy3);
2325 dz33 = _mm256_sub_ps(iz3,jz3);
2327 /* Calculate squared distance and things based on it */
2328 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2329 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2330 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2331 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2332 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2333 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2334 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2335 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2336 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2337 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2339 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2340 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2341 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2342 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2343 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2344 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2345 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2346 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2347 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2349 rinvsq00 = gmx_mm256_inv_ps(rsq00);
2351 fjx0 = _mm256_setzero_ps();
2352 fjy0 = _mm256_setzero_ps();
2353 fjz0 = _mm256_setzero_ps();
2354 fjx1 = _mm256_setzero_ps();
2355 fjy1 = _mm256_setzero_ps();
2356 fjz1 = _mm256_setzero_ps();
2357 fjx2 = _mm256_setzero_ps();
2358 fjy2 = _mm256_setzero_ps();
2359 fjz2 = _mm256_setzero_ps();
2360 fjx3 = _mm256_setzero_ps();
2361 fjy3 = _mm256_setzero_ps();
2362 fjz3 = _mm256_setzero_ps();
2364 /**************************
2365 * CALCULATE INTERACTIONS *
2366 **************************/
2368 /* LENNARD-JONES DISPERSION/REPULSION */
2370 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2371 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2375 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2377 /* Calculate temporary vectorial force */
2378 tx = _mm256_mul_ps(fscal,dx00);
2379 ty = _mm256_mul_ps(fscal,dy00);
2380 tz = _mm256_mul_ps(fscal,dz00);
2382 /* Update vectorial force */
2383 fix0 = _mm256_add_ps(fix0,tx);
2384 fiy0 = _mm256_add_ps(fiy0,ty);
2385 fiz0 = _mm256_add_ps(fiz0,tz);
2387 fjx0 = _mm256_add_ps(fjx0,tx);
2388 fjy0 = _mm256_add_ps(fjy0,ty);
2389 fjz0 = _mm256_add_ps(fjz0,tz);
2391 /**************************
2392 * CALCULATE INTERACTIONS *
2393 **************************/
2395 r11 = _mm256_mul_ps(rsq11,rinv11);
2396 r11 = _mm256_andnot_ps(dummy_mask,r11);
2398 /* Calculate table index by multiplying r with table scale and truncate to integer */
2399 rt = _mm256_mul_ps(r11,vftabscale);
2400 vfitab = _mm256_cvttps_epi32(rt);
2401 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2402 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2403 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2404 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2405 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2406 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2408 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2409 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2410 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2411 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2412 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2413 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2414 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2415 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2416 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2417 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2418 Heps = _mm256_mul_ps(vfeps,H);
2419 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2420 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2421 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2425 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2427 /* Calculate temporary vectorial force */
2428 tx = _mm256_mul_ps(fscal,dx11);
2429 ty = _mm256_mul_ps(fscal,dy11);
2430 tz = _mm256_mul_ps(fscal,dz11);
2432 /* Update vectorial force */
2433 fix1 = _mm256_add_ps(fix1,tx);
2434 fiy1 = _mm256_add_ps(fiy1,ty);
2435 fiz1 = _mm256_add_ps(fiz1,tz);
2437 fjx1 = _mm256_add_ps(fjx1,tx);
2438 fjy1 = _mm256_add_ps(fjy1,ty);
2439 fjz1 = _mm256_add_ps(fjz1,tz);
2441 /**************************
2442 * CALCULATE INTERACTIONS *
2443 **************************/
2445 r12 = _mm256_mul_ps(rsq12,rinv12);
2446 r12 = _mm256_andnot_ps(dummy_mask,r12);
2448 /* Calculate table index by multiplying r with table scale and truncate to integer */
2449 rt = _mm256_mul_ps(r12,vftabscale);
2450 vfitab = _mm256_cvttps_epi32(rt);
2451 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2452 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2453 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2454 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2455 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2456 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2458 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2459 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2460 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2461 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2462 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2463 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2464 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2465 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2466 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2467 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2468 Heps = _mm256_mul_ps(vfeps,H);
2469 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2470 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2471 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2475 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2477 /* Calculate temporary vectorial force */
2478 tx = _mm256_mul_ps(fscal,dx12);
2479 ty = _mm256_mul_ps(fscal,dy12);
2480 tz = _mm256_mul_ps(fscal,dz12);
2482 /* Update vectorial force */
2483 fix1 = _mm256_add_ps(fix1,tx);
2484 fiy1 = _mm256_add_ps(fiy1,ty);
2485 fiz1 = _mm256_add_ps(fiz1,tz);
2487 fjx2 = _mm256_add_ps(fjx2,tx);
2488 fjy2 = _mm256_add_ps(fjy2,ty);
2489 fjz2 = _mm256_add_ps(fjz2,tz);
2491 /**************************
2492 * CALCULATE INTERACTIONS *
2493 **************************/
2495 r13 = _mm256_mul_ps(rsq13,rinv13);
2496 r13 = _mm256_andnot_ps(dummy_mask,r13);
2498 /* Calculate table index by multiplying r with table scale and truncate to integer */
2499 rt = _mm256_mul_ps(r13,vftabscale);
2500 vfitab = _mm256_cvttps_epi32(rt);
2501 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2502 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2503 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2504 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2505 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2506 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2508 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2509 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2510 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2511 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2512 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2513 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2514 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2515 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2516 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2517 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2518 Heps = _mm256_mul_ps(vfeps,H);
2519 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2520 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2521 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
2525 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2527 /* Calculate temporary vectorial force */
2528 tx = _mm256_mul_ps(fscal,dx13);
2529 ty = _mm256_mul_ps(fscal,dy13);
2530 tz = _mm256_mul_ps(fscal,dz13);
2532 /* Update vectorial force */
2533 fix1 = _mm256_add_ps(fix1,tx);
2534 fiy1 = _mm256_add_ps(fiy1,ty);
2535 fiz1 = _mm256_add_ps(fiz1,tz);
2537 fjx3 = _mm256_add_ps(fjx3,tx);
2538 fjy3 = _mm256_add_ps(fjy3,ty);
2539 fjz3 = _mm256_add_ps(fjz3,tz);
2541 /**************************
2542 * CALCULATE INTERACTIONS *
2543 **************************/
2545 r21 = _mm256_mul_ps(rsq21,rinv21);
2546 r21 = _mm256_andnot_ps(dummy_mask,r21);
2548 /* Calculate table index by multiplying r with table scale and truncate to integer */
2549 rt = _mm256_mul_ps(r21,vftabscale);
2550 vfitab = _mm256_cvttps_epi32(rt);
2551 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2552 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2553 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2554 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2555 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2556 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2558 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2559 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2560 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2561 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2562 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2563 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2564 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2565 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2566 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2567 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2568 Heps = _mm256_mul_ps(vfeps,H);
2569 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2570 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2571 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2575 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2577 /* Calculate temporary vectorial force */
2578 tx = _mm256_mul_ps(fscal,dx21);
2579 ty = _mm256_mul_ps(fscal,dy21);
2580 tz = _mm256_mul_ps(fscal,dz21);
2582 /* Update vectorial force */
2583 fix2 = _mm256_add_ps(fix2,tx);
2584 fiy2 = _mm256_add_ps(fiy2,ty);
2585 fiz2 = _mm256_add_ps(fiz2,tz);
2587 fjx1 = _mm256_add_ps(fjx1,tx);
2588 fjy1 = _mm256_add_ps(fjy1,ty);
2589 fjz1 = _mm256_add_ps(fjz1,tz);
2591 /**************************
2592 * CALCULATE INTERACTIONS *
2593 **************************/
2595 r22 = _mm256_mul_ps(rsq22,rinv22);
2596 r22 = _mm256_andnot_ps(dummy_mask,r22);
2598 /* Calculate table index by multiplying r with table scale and truncate to integer */
2599 rt = _mm256_mul_ps(r22,vftabscale);
2600 vfitab = _mm256_cvttps_epi32(rt);
2601 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2602 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2603 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2604 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2605 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2606 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2608 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2609 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2610 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2611 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2612 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2613 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2614 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2615 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2616 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2617 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2618 Heps = _mm256_mul_ps(vfeps,H);
2619 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2620 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2621 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2625 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2627 /* Calculate temporary vectorial force */
2628 tx = _mm256_mul_ps(fscal,dx22);
2629 ty = _mm256_mul_ps(fscal,dy22);
2630 tz = _mm256_mul_ps(fscal,dz22);
2632 /* Update vectorial force */
2633 fix2 = _mm256_add_ps(fix2,tx);
2634 fiy2 = _mm256_add_ps(fiy2,ty);
2635 fiz2 = _mm256_add_ps(fiz2,tz);
2637 fjx2 = _mm256_add_ps(fjx2,tx);
2638 fjy2 = _mm256_add_ps(fjy2,ty);
2639 fjz2 = _mm256_add_ps(fjz2,tz);
2641 /**************************
2642 * CALCULATE INTERACTIONS *
2643 **************************/
2645 r23 = _mm256_mul_ps(rsq23,rinv23);
2646 r23 = _mm256_andnot_ps(dummy_mask,r23);
2648 /* Calculate table index by multiplying r with table scale and truncate to integer */
2649 rt = _mm256_mul_ps(r23,vftabscale);
2650 vfitab = _mm256_cvttps_epi32(rt);
2651 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2652 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2653 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2654 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2655 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2656 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2658 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2659 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2660 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2661 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2662 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2663 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2664 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2665 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2666 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2667 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2668 Heps = _mm256_mul_ps(vfeps,H);
2669 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2670 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2671 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2675 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2677 /* Calculate temporary vectorial force */
2678 tx = _mm256_mul_ps(fscal,dx23);
2679 ty = _mm256_mul_ps(fscal,dy23);
2680 tz = _mm256_mul_ps(fscal,dz23);
2682 /* Update vectorial force */
2683 fix2 = _mm256_add_ps(fix2,tx);
2684 fiy2 = _mm256_add_ps(fiy2,ty);
2685 fiz2 = _mm256_add_ps(fiz2,tz);
2687 fjx3 = _mm256_add_ps(fjx3,tx);
2688 fjy3 = _mm256_add_ps(fjy3,ty);
2689 fjz3 = _mm256_add_ps(fjz3,tz);
2691 /**************************
2692 * CALCULATE INTERACTIONS *
2693 **************************/
2695 r31 = _mm256_mul_ps(rsq31,rinv31);
2696 r31 = _mm256_andnot_ps(dummy_mask,r31);
2698 /* Calculate table index by multiplying r with table scale and truncate to integer */
2699 rt = _mm256_mul_ps(r31,vftabscale);
2700 vfitab = _mm256_cvttps_epi32(rt);
2701 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2702 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2703 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2704 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2705 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2706 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2708 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2709 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2710 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2711 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2712 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2713 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2714 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2715 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2716 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2717 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2718 Heps = _mm256_mul_ps(vfeps,H);
2719 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2720 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2721 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2725 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2727 /* Calculate temporary vectorial force */
2728 tx = _mm256_mul_ps(fscal,dx31);
2729 ty = _mm256_mul_ps(fscal,dy31);
2730 tz = _mm256_mul_ps(fscal,dz31);
2732 /* Update vectorial force */
2733 fix3 = _mm256_add_ps(fix3,tx);
2734 fiy3 = _mm256_add_ps(fiy3,ty);
2735 fiz3 = _mm256_add_ps(fiz3,tz);
2737 fjx1 = _mm256_add_ps(fjx1,tx);
2738 fjy1 = _mm256_add_ps(fjy1,ty);
2739 fjz1 = _mm256_add_ps(fjz1,tz);
2741 /**************************
2742 * CALCULATE INTERACTIONS *
2743 **************************/
2745 r32 = _mm256_mul_ps(rsq32,rinv32);
2746 r32 = _mm256_andnot_ps(dummy_mask,r32);
2748 /* Calculate table index by multiplying r with table scale and truncate to integer */
2749 rt = _mm256_mul_ps(r32,vftabscale);
2750 vfitab = _mm256_cvttps_epi32(rt);
2751 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2752 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2753 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2754 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2755 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2756 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2758 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2759 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2760 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2761 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2762 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2763 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2764 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2765 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2766 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2767 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2768 Heps = _mm256_mul_ps(vfeps,H);
2769 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2770 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2771 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2775 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2777 /* Calculate temporary vectorial force */
2778 tx = _mm256_mul_ps(fscal,dx32);
2779 ty = _mm256_mul_ps(fscal,dy32);
2780 tz = _mm256_mul_ps(fscal,dz32);
2782 /* Update vectorial force */
2783 fix3 = _mm256_add_ps(fix3,tx);
2784 fiy3 = _mm256_add_ps(fiy3,ty);
2785 fiz3 = _mm256_add_ps(fiz3,tz);
2787 fjx2 = _mm256_add_ps(fjx2,tx);
2788 fjy2 = _mm256_add_ps(fjy2,ty);
2789 fjz2 = _mm256_add_ps(fjz2,tz);
2791 /**************************
2792 * CALCULATE INTERACTIONS *
2793 **************************/
2795 r33 = _mm256_mul_ps(rsq33,rinv33);
2796 r33 = _mm256_andnot_ps(dummy_mask,r33);
2798 /* Calculate table index by multiplying r with table scale and truncate to integer */
2799 rt = _mm256_mul_ps(r33,vftabscale);
2800 vfitab = _mm256_cvttps_epi32(rt);
2801 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2802 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2803 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2804 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2805 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2806 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2808 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2809 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2810 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2811 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2812 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2813 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2814 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2815 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2816 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2817 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2818 Heps = _mm256_mul_ps(vfeps,H);
2819 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2820 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2821 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2825 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2827 /* Calculate temporary vectorial force */
2828 tx = _mm256_mul_ps(fscal,dx33);
2829 ty = _mm256_mul_ps(fscal,dy33);
2830 tz = _mm256_mul_ps(fscal,dz33);
2832 /* Update vectorial force */
2833 fix3 = _mm256_add_ps(fix3,tx);
2834 fiy3 = _mm256_add_ps(fiy3,ty);
2835 fiz3 = _mm256_add_ps(fiz3,tz);
2837 fjx3 = _mm256_add_ps(fjx3,tx);
2838 fjy3 = _mm256_add_ps(fjy3,ty);
2839 fjz3 = _mm256_add_ps(fjz3,tz);
2841 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2842 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2843 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2844 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2845 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2846 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2847 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2848 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2850 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2851 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2852 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2854 /* Inner loop uses 390 flops */
2857 /* End of innermost loop */
2859 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2860 f+i_coord_offset,fshift+i_shift_offset);
2862 /* Increment number of inner iterations */
2863 inneriter += j_index_end - j_index_start;
2865 /* Outer loop uses 24 flops */
2868 /* Increment number of outer iterations */
2871 /* Update outer/inner flops */
2873 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*390);