2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
96 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
100 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
102 __m128i vfitab_lo,vfitab_hi;
103 __m128i ifour = _mm_set1_epi32(4);
104 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
106 __m256 dummy_mask,cutoff_mask;
107 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
108 __m256 one = _mm256_set1_ps(1.0);
109 __m256 two = _mm256_set1_ps(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm256_set1_ps(fr->epsfac);
122 charge = mdatoms->chargeA;
123 nvdwtype = fr->ntype;
125 vdwtype = mdatoms->typeA;
127 vftab = kernel_data->table_elec->data;
128 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
130 /* Setup water-specific parameters */
131 inr = nlist->iinr[0];
132 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
133 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
134 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
135 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
137 jq0 = _mm256_set1_ps(charge[inr+0]);
138 jq1 = _mm256_set1_ps(charge[inr+1]);
139 jq2 = _mm256_set1_ps(charge[inr+2]);
140 vdwjidx0A = 2*vdwtype[inr+0];
141 qq00 = _mm256_mul_ps(iq0,jq0);
142 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
143 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
144 qq01 = _mm256_mul_ps(iq0,jq1);
145 qq02 = _mm256_mul_ps(iq0,jq2);
146 qq10 = _mm256_mul_ps(iq1,jq0);
147 qq11 = _mm256_mul_ps(iq1,jq1);
148 qq12 = _mm256_mul_ps(iq1,jq2);
149 qq20 = _mm256_mul_ps(iq2,jq0);
150 qq21 = _mm256_mul_ps(iq2,jq1);
151 qq22 = _mm256_mul_ps(iq2,jq2);
153 /* Avoid stupid compiler warnings */
154 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
167 for(iidx=0;iidx<4*DIM;iidx++)
172 /* Start outer loop over neighborlists */
173 for(iidx=0; iidx<nri; iidx++)
175 /* Load shift vector for this list */
176 i_shift_offset = DIM*shiftidx[iidx];
178 /* Load limits for loop over neighbors */
179 j_index_start = jindex[iidx];
180 j_index_end = jindex[iidx+1];
182 /* Get outer coordinate index */
184 i_coord_offset = DIM*inr;
186 /* Load i particle coords and add shift vector */
187 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
188 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
190 fix0 = _mm256_setzero_ps();
191 fiy0 = _mm256_setzero_ps();
192 fiz0 = _mm256_setzero_ps();
193 fix1 = _mm256_setzero_ps();
194 fiy1 = _mm256_setzero_ps();
195 fiz1 = _mm256_setzero_ps();
196 fix2 = _mm256_setzero_ps();
197 fiy2 = _mm256_setzero_ps();
198 fiz2 = _mm256_setzero_ps();
200 /* Reset potential sums */
201 velecsum = _mm256_setzero_ps();
202 vvdwsum = _mm256_setzero_ps();
204 /* Start inner kernel loop */
205 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
208 /* Get j neighbor index, and coordinate index */
217 j_coord_offsetA = DIM*jnrA;
218 j_coord_offsetB = DIM*jnrB;
219 j_coord_offsetC = DIM*jnrC;
220 j_coord_offsetD = DIM*jnrD;
221 j_coord_offsetE = DIM*jnrE;
222 j_coord_offsetF = DIM*jnrF;
223 j_coord_offsetG = DIM*jnrG;
224 j_coord_offsetH = DIM*jnrH;
226 /* load j atom coordinates */
227 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
228 x+j_coord_offsetC,x+j_coord_offsetD,
229 x+j_coord_offsetE,x+j_coord_offsetF,
230 x+j_coord_offsetG,x+j_coord_offsetH,
231 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
233 /* Calculate displacement vector */
234 dx00 = _mm256_sub_ps(ix0,jx0);
235 dy00 = _mm256_sub_ps(iy0,jy0);
236 dz00 = _mm256_sub_ps(iz0,jz0);
237 dx01 = _mm256_sub_ps(ix0,jx1);
238 dy01 = _mm256_sub_ps(iy0,jy1);
239 dz01 = _mm256_sub_ps(iz0,jz1);
240 dx02 = _mm256_sub_ps(ix0,jx2);
241 dy02 = _mm256_sub_ps(iy0,jy2);
242 dz02 = _mm256_sub_ps(iz0,jz2);
243 dx10 = _mm256_sub_ps(ix1,jx0);
244 dy10 = _mm256_sub_ps(iy1,jy0);
245 dz10 = _mm256_sub_ps(iz1,jz0);
246 dx11 = _mm256_sub_ps(ix1,jx1);
247 dy11 = _mm256_sub_ps(iy1,jy1);
248 dz11 = _mm256_sub_ps(iz1,jz1);
249 dx12 = _mm256_sub_ps(ix1,jx2);
250 dy12 = _mm256_sub_ps(iy1,jy2);
251 dz12 = _mm256_sub_ps(iz1,jz2);
252 dx20 = _mm256_sub_ps(ix2,jx0);
253 dy20 = _mm256_sub_ps(iy2,jy0);
254 dz20 = _mm256_sub_ps(iz2,jz0);
255 dx21 = _mm256_sub_ps(ix2,jx1);
256 dy21 = _mm256_sub_ps(iy2,jy1);
257 dz21 = _mm256_sub_ps(iz2,jz1);
258 dx22 = _mm256_sub_ps(ix2,jx2);
259 dy22 = _mm256_sub_ps(iy2,jy2);
260 dz22 = _mm256_sub_ps(iz2,jz2);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
264 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
265 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
266 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
267 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
268 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
269 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
270 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
271 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
273 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
274 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
275 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
276 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
277 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
278 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
279 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
280 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
281 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
283 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
285 fjx0 = _mm256_setzero_ps();
286 fjy0 = _mm256_setzero_ps();
287 fjz0 = _mm256_setzero_ps();
288 fjx1 = _mm256_setzero_ps();
289 fjy1 = _mm256_setzero_ps();
290 fjz1 = _mm256_setzero_ps();
291 fjx2 = _mm256_setzero_ps();
292 fjy2 = _mm256_setzero_ps();
293 fjz2 = _mm256_setzero_ps();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 r00 = _mm256_mul_ps(rsq00,rinv00);
301 /* Calculate table index by multiplying r with table scale and truncate to integer */
302 rt = _mm256_mul_ps(r00,vftabscale);
303 vfitab = _mm256_cvttps_epi32(rt);
304 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
305 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
306 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
307 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
308 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
309 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
311 /* CUBIC SPLINE TABLE ELECTROSTATICS */
312 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
313 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
314 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
315 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
316 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
317 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
318 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
319 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
320 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
321 Heps = _mm256_mul_ps(vfeps,H);
322 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
323 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
324 velec = _mm256_mul_ps(qq00,VV);
325 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
326 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
328 /* LENNARD-JONES DISPERSION/REPULSION */
330 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
331 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
332 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
333 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
334 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
336 /* Update potential sum for this i atom from the interaction with this j atom. */
337 velecsum = _mm256_add_ps(velecsum,velec);
338 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
340 fscal = _mm256_add_ps(felec,fvdw);
342 /* Calculate temporary vectorial force */
343 tx = _mm256_mul_ps(fscal,dx00);
344 ty = _mm256_mul_ps(fscal,dy00);
345 tz = _mm256_mul_ps(fscal,dz00);
347 /* Update vectorial force */
348 fix0 = _mm256_add_ps(fix0,tx);
349 fiy0 = _mm256_add_ps(fiy0,ty);
350 fiz0 = _mm256_add_ps(fiz0,tz);
352 fjx0 = _mm256_add_ps(fjx0,tx);
353 fjy0 = _mm256_add_ps(fjy0,ty);
354 fjz0 = _mm256_add_ps(fjz0,tz);
356 /**************************
357 * CALCULATE INTERACTIONS *
358 **************************/
360 r01 = _mm256_mul_ps(rsq01,rinv01);
362 /* Calculate table index by multiplying r with table scale and truncate to integer */
363 rt = _mm256_mul_ps(r01,vftabscale);
364 vfitab = _mm256_cvttps_epi32(rt);
365 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
366 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
367 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
368 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
369 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
370 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
372 /* CUBIC SPLINE TABLE ELECTROSTATICS */
373 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
374 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
375 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
376 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
377 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
378 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
379 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
380 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
381 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
382 Heps = _mm256_mul_ps(vfeps,H);
383 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
384 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
385 velec = _mm256_mul_ps(qq01,VV);
386 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
387 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
389 /* Update potential sum for this i atom from the interaction with this j atom. */
390 velecsum = _mm256_add_ps(velecsum,velec);
394 /* Calculate temporary vectorial force */
395 tx = _mm256_mul_ps(fscal,dx01);
396 ty = _mm256_mul_ps(fscal,dy01);
397 tz = _mm256_mul_ps(fscal,dz01);
399 /* Update vectorial force */
400 fix0 = _mm256_add_ps(fix0,tx);
401 fiy0 = _mm256_add_ps(fiy0,ty);
402 fiz0 = _mm256_add_ps(fiz0,tz);
404 fjx1 = _mm256_add_ps(fjx1,tx);
405 fjy1 = _mm256_add_ps(fjy1,ty);
406 fjz1 = _mm256_add_ps(fjz1,tz);
408 /**************************
409 * CALCULATE INTERACTIONS *
410 **************************/
412 r02 = _mm256_mul_ps(rsq02,rinv02);
414 /* Calculate table index by multiplying r with table scale and truncate to integer */
415 rt = _mm256_mul_ps(r02,vftabscale);
416 vfitab = _mm256_cvttps_epi32(rt);
417 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
418 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
419 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
420 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
421 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
422 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
424 /* CUBIC SPLINE TABLE ELECTROSTATICS */
425 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
426 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
427 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
428 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
429 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
430 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
431 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
432 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
433 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
434 Heps = _mm256_mul_ps(vfeps,H);
435 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
436 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
437 velec = _mm256_mul_ps(qq02,VV);
438 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
439 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
441 /* Update potential sum for this i atom from the interaction with this j atom. */
442 velecsum = _mm256_add_ps(velecsum,velec);
446 /* Calculate temporary vectorial force */
447 tx = _mm256_mul_ps(fscal,dx02);
448 ty = _mm256_mul_ps(fscal,dy02);
449 tz = _mm256_mul_ps(fscal,dz02);
451 /* Update vectorial force */
452 fix0 = _mm256_add_ps(fix0,tx);
453 fiy0 = _mm256_add_ps(fiy0,ty);
454 fiz0 = _mm256_add_ps(fiz0,tz);
456 fjx2 = _mm256_add_ps(fjx2,tx);
457 fjy2 = _mm256_add_ps(fjy2,ty);
458 fjz2 = _mm256_add_ps(fjz2,tz);
460 /**************************
461 * CALCULATE INTERACTIONS *
462 **************************/
464 r10 = _mm256_mul_ps(rsq10,rinv10);
466 /* Calculate table index by multiplying r with table scale and truncate to integer */
467 rt = _mm256_mul_ps(r10,vftabscale);
468 vfitab = _mm256_cvttps_epi32(rt);
469 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
470 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
471 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
472 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
473 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
474 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
476 /* CUBIC SPLINE TABLE ELECTROSTATICS */
477 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
478 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
479 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
480 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
481 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
482 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
483 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
484 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
485 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
486 Heps = _mm256_mul_ps(vfeps,H);
487 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
488 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
489 velec = _mm256_mul_ps(qq10,VV);
490 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
491 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
493 /* Update potential sum for this i atom from the interaction with this j atom. */
494 velecsum = _mm256_add_ps(velecsum,velec);
498 /* Calculate temporary vectorial force */
499 tx = _mm256_mul_ps(fscal,dx10);
500 ty = _mm256_mul_ps(fscal,dy10);
501 tz = _mm256_mul_ps(fscal,dz10);
503 /* Update vectorial force */
504 fix1 = _mm256_add_ps(fix1,tx);
505 fiy1 = _mm256_add_ps(fiy1,ty);
506 fiz1 = _mm256_add_ps(fiz1,tz);
508 fjx0 = _mm256_add_ps(fjx0,tx);
509 fjy0 = _mm256_add_ps(fjy0,ty);
510 fjz0 = _mm256_add_ps(fjz0,tz);
512 /**************************
513 * CALCULATE INTERACTIONS *
514 **************************/
516 r11 = _mm256_mul_ps(rsq11,rinv11);
518 /* Calculate table index by multiplying r with table scale and truncate to integer */
519 rt = _mm256_mul_ps(r11,vftabscale);
520 vfitab = _mm256_cvttps_epi32(rt);
521 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
522 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
523 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
524 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
525 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
526 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
528 /* CUBIC SPLINE TABLE ELECTROSTATICS */
529 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
530 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
531 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
532 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
533 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
534 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
535 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
536 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
537 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
538 Heps = _mm256_mul_ps(vfeps,H);
539 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
540 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
541 velec = _mm256_mul_ps(qq11,VV);
542 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
543 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
545 /* Update potential sum for this i atom from the interaction with this j atom. */
546 velecsum = _mm256_add_ps(velecsum,velec);
550 /* Calculate temporary vectorial force */
551 tx = _mm256_mul_ps(fscal,dx11);
552 ty = _mm256_mul_ps(fscal,dy11);
553 tz = _mm256_mul_ps(fscal,dz11);
555 /* Update vectorial force */
556 fix1 = _mm256_add_ps(fix1,tx);
557 fiy1 = _mm256_add_ps(fiy1,ty);
558 fiz1 = _mm256_add_ps(fiz1,tz);
560 fjx1 = _mm256_add_ps(fjx1,tx);
561 fjy1 = _mm256_add_ps(fjy1,ty);
562 fjz1 = _mm256_add_ps(fjz1,tz);
564 /**************************
565 * CALCULATE INTERACTIONS *
566 **************************/
568 r12 = _mm256_mul_ps(rsq12,rinv12);
570 /* Calculate table index by multiplying r with table scale and truncate to integer */
571 rt = _mm256_mul_ps(r12,vftabscale);
572 vfitab = _mm256_cvttps_epi32(rt);
573 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
574 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
575 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
576 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
577 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
578 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
580 /* CUBIC SPLINE TABLE ELECTROSTATICS */
581 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
582 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
583 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
584 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
585 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
586 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
587 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
588 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
589 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
590 Heps = _mm256_mul_ps(vfeps,H);
591 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
592 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
593 velec = _mm256_mul_ps(qq12,VV);
594 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
595 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
597 /* Update potential sum for this i atom from the interaction with this j atom. */
598 velecsum = _mm256_add_ps(velecsum,velec);
602 /* Calculate temporary vectorial force */
603 tx = _mm256_mul_ps(fscal,dx12);
604 ty = _mm256_mul_ps(fscal,dy12);
605 tz = _mm256_mul_ps(fscal,dz12);
607 /* Update vectorial force */
608 fix1 = _mm256_add_ps(fix1,tx);
609 fiy1 = _mm256_add_ps(fiy1,ty);
610 fiz1 = _mm256_add_ps(fiz1,tz);
612 fjx2 = _mm256_add_ps(fjx2,tx);
613 fjy2 = _mm256_add_ps(fjy2,ty);
614 fjz2 = _mm256_add_ps(fjz2,tz);
616 /**************************
617 * CALCULATE INTERACTIONS *
618 **************************/
620 r20 = _mm256_mul_ps(rsq20,rinv20);
622 /* Calculate table index by multiplying r with table scale and truncate to integer */
623 rt = _mm256_mul_ps(r20,vftabscale);
624 vfitab = _mm256_cvttps_epi32(rt);
625 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
626 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
627 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
628 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
629 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
630 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
632 /* CUBIC SPLINE TABLE ELECTROSTATICS */
633 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
634 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
635 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
636 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
637 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
638 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
639 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
640 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
641 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
642 Heps = _mm256_mul_ps(vfeps,H);
643 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
644 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
645 velec = _mm256_mul_ps(qq20,VV);
646 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
647 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
649 /* Update potential sum for this i atom from the interaction with this j atom. */
650 velecsum = _mm256_add_ps(velecsum,velec);
654 /* Calculate temporary vectorial force */
655 tx = _mm256_mul_ps(fscal,dx20);
656 ty = _mm256_mul_ps(fscal,dy20);
657 tz = _mm256_mul_ps(fscal,dz20);
659 /* Update vectorial force */
660 fix2 = _mm256_add_ps(fix2,tx);
661 fiy2 = _mm256_add_ps(fiy2,ty);
662 fiz2 = _mm256_add_ps(fiz2,tz);
664 fjx0 = _mm256_add_ps(fjx0,tx);
665 fjy0 = _mm256_add_ps(fjy0,ty);
666 fjz0 = _mm256_add_ps(fjz0,tz);
668 /**************************
669 * CALCULATE INTERACTIONS *
670 **************************/
672 r21 = _mm256_mul_ps(rsq21,rinv21);
674 /* Calculate table index by multiplying r with table scale and truncate to integer */
675 rt = _mm256_mul_ps(r21,vftabscale);
676 vfitab = _mm256_cvttps_epi32(rt);
677 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
678 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
679 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
680 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
681 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
682 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
684 /* CUBIC SPLINE TABLE ELECTROSTATICS */
685 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
686 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
687 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
688 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
689 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
690 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
691 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
692 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
693 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
694 Heps = _mm256_mul_ps(vfeps,H);
695 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
696 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
697 velec = _mm256_mul_ps(qq21,VV);
698 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
699 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
701 /* Update potential sum for this i atom from the interaction with this j atom. */
702 velecsum = _mm256_add_ps(velecsum,velec);
706 /* Calculate temporary vectorial force */
707 tx = _mm256_mul_ps(fscal,dx21);
708 ty = _mm256_mul_ps(fscal,dy21);
709 tz = _mm256_mul_ps(fscal,dz21);
711 /* Update vectorial force */
712 fix2 = _mm256_add_ps(fix2,tx);
713 fiy2 = _mm256_add_ps(fiy2,ty);
714 fiz2 = _mm256_add_ps(fiz2,tz);
716 fjx1 = _mm256_add_ps(fjx1,tx);
717 fjy1 = _mm256_add_ps(fjy1,ty);
718 fjz1 = _mm256_add_ps(fjz1,tz);
720 /**************************
721 * CALCULATE INTERACTIONS *
722 **************************/
724 r22 = _mm256_mul_ps(rsq22,rinv22);
726 /* Calculate table index by multiplying r with table scale and truncate to integer */
727 rt = _mm256_mul_ps(r22,vftabscale);
728 vfitab = _mm256_cvttps_epi32(rt);
729 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
730 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
731 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
732 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
733 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
734 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
736 /* CUBIC SPLINE TABLE ELECTROSTATICS */
737 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
738 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
739 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
740 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
741 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
742 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
743 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
744 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
745 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
746 Heps = _mm256_mul_ps(vfeps,H);
747 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
748 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
749 velec = _mm256_mul_ps(qq22,VV);
750 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
751 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
753 /* Update potential sum for this i atom from the interaction with this j atom. */
754 velecsum = _mm256_add_ps(velecsum,velec);
758 /* Calculate temporary vectorial force */
759 tx = _mm256_mul_ps(fscal,dx22);
760 ty = _mm256_mul_ps(fscal,dy22);
761 tz = _mm256_mul_ps(fscal,dz22);
763 /* Update vectorial force */
764 fix2 = _mm256_add_ps(fix2,tx);
765 fiy2 = _mm256_add_ps(fiy2,ty);
766 fiz2 = _mm256_add_ps(fiz2,tz);
768 fjx2 = _mm256_add_ps(fjx2,tx);
769 fjy2 = _mm256_add_ps(fjy2,ty);
770 fjz2 = _mm256_add_ps(fjz2,tz);
772 fjptrA = f+j_coord_offsetA;
773 fjptrB = f+j_coord_offsetB;
774 fjptrC = f+j_coord_offsetC;
775 fjptrD = f+j_coord_offsetD;
776 fjptrE = f+j_coord_offsetE;
777 fjptrF = f+j_coord_offsetF;
778 fjptrG = f+j_coord_offsetG;
779 fjptrH = f+j_coord_offsetH;
781 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
782 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
784 /* Inner loop uses 400 flops */
790 /* Get j neighbor index, and coordinate index */
791 jnrlistA = jjnr[jidx];
792 jnrlistB = jjnr[jidx+1];
793 jnrlistC = jjnr[jidx+2];
794 jnrlistD = jjnr[jidx+3];
795 jnrlistE = jjnr[jidx+4];
796 jnrlistF = jjnr[jidx+5];
797 jnrlistG = jjnr[jidx+6];
798 jnrlistH = jjnr[jidx+7];
799 /* Sign of each element will be negative for non-real atoms.
800 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
801 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
803 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
804 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
806 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
807 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
808 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
809 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
810 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
811 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
812 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
813 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
814 j_coord_offsetA = DIM*jnrA;
815 j_coord_offsetB = DIM*jnrB;
816 j_coord_offsetC = DIM*jnrC;
817 j_coord_offsetD = DIM*jnrD;
818 j_coord_offsetE = DIM*jnrE;
819 j_coord_offsetF = DIM*jnrF;
820 j_coord_offsetG = DIM*jnrG;
821 j_coord_offsetH = DIM*jnrH;
823 /* load j atom coordinates */
824 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
825 x+j_coord_offsetC,x+j_coord_offsetD,
826 x+j_coord_offsetE,x+j_coord_offsetF,
827 x+j_coord_offsetG,x+j_coord_offsetH,
828 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
830 /* Calculate displacement vector */
831 dx00 = _mm256_sub_ps(ix0,jx0);
832 dy00 = _mm256_sub_ps(iy0,jy0);
833 dz00 = _mm256_sub_ps(iz0,jz0);
834 dx01 = _mm256_sub_ps(ix0,jx1);
835 dy01 = _mm256_sub_ps(iy0,jy1);
836 dz01 = _mm256_sub_ps(iz0,jz1);
837 dx02 = _mm256_sub_ps(ix0,jx2);
838 dy02 = _mm256_sub_ps(iy0,jy2);
839 dz02 = _mm256_sub_ps(iz0,jz2);
840 dx10 = _mm256_sub_ps(ix1,jx0);
841 dy10 = _mm256_sub_ps(iy1,jy0);
842 dz10 = _mm256_sub_ps(iz1,jz0);
843 dx11 = _mm256_sub_ps(ix1,jx1);
844 dy11 = _mm256_sub_ps(iy1,jy1);
845 dz11 = _mm256_sub_ps(iz1,jz1);
846 dx12 = _mm256_sub_ps(ix1,jx2);
847 dy12 = _mm256_sub_ps(iy1,jy2);
848 dz12 = _mm256_sub_ps(iz1,jz2);
849 dx20 = _mm256_sub_ps(ix2,jx0);
850 dy20 = _mm256_sub_ps(iy2,jy0);
851 dz20 = _mm256_sub_ps(iz2,jz0);
852 dx21 = _mm256_sub_ps(ix2,jx1);
853 dy21 = _mm256_sub_ps(iy2,jy1);
854 dz21 = _mm256_sub_ps(iz2,jz1);
855 dx22 = _mm256_sub_ps(ix2,jx2);
856 dy22 = _mm256_sub_ps(iy2,jy2);
857 dz22 = _mm256_sub_ps(iz2,jz2);
859 /* Calculate squared distance and things based on it */
860 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
861 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
862 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
863 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
864 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
865 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
866 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
867 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
868 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
870 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
871 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
872 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
873 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
874 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
875 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
876 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
877 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
878 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
880 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
882 fjx0 = _mm256_setzero_ps();
883 fjy0 = _mm256_setzero_ps();
884 fjz0 = _mm256_setzero_ps();
885 fjx1 = _mm256_setzero_ps();
886 fjy1 = _mm256_setzero_ps();
887 fjz1 = _mm256_setzero_ps();
888 fjx2 = _mm256_setzero_ps();
889 fjy2 = _mm256_setzero_ps();
890 fjz2 = _mm256_setzero_ps();
892 /**************************
893 * CALCULATE INTERACTIONS *
894 **************************/
896 r00 = _mm256_mul_ps(rsq00,rinv00);
897 r00 = _mm256_andnot_ps(dummy_mask,r00);
899 /* Calculate table index by multiplying r with table scale and truncate to integer */
900 rt = _mm256_mul_ps(r00,vftabscale);
901 vfitab = _mm256_cvttps_epi32(rt);
902 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
903 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
904 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
905 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
906 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
907 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
909 /* CUBIC SPLINE TABLE ELECTROSTATICS */
910 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
911 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
912 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
913 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
914 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
915 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
916 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
917 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
918 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
919 Heps = _mm256_mul_ps(vfeps,H);
920 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
921 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
922 velec = _mm256_mul_ps(qq00,VV);
923 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
924 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
926 /* LENNARD-JONES DISPERSION/REPULSION */
928 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
929 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
930 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
931 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
932 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
934 /* Update potential sum for this i atom from the interaction with this j atom. */
935 velec = _mm256_andnot_ps(dummy_mask,velec);
936 velecsum = _mm256_add_ps(velecsum,velec);
937 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
938 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
940 fscal = _mm256_add_ps(felec,fvdw);
942 fscal = _mm256_andnot_ps(dummy_mask,fscal);
944 /* Calculate temporary vectorial force */
945 tx = _mm256_mul_ps(fscal,dx00);
946 ty = _mm256_mul_ps(fscal,dy00);
947 tz = _mm256_mul_ps(fscal,dz00);
949 /* Update vectorial force */
950 fix0 = _mm256_add_ps(fix0,tx);
951 fiy0 = _mm256_add_ps(fiy0,ty);
952 fiz0 = _mm256_add_ps(fiz0,tz);
954 fjx0 = _mm256_add_ps(fjx0,tx);
955 fjy0 = _mm256_add_ps(fjy0,ty);
956 fjz0 = _mm256_add_ps(fjz0,tz);
958 /**************************
959 * CALCULATE INTERACTIONS *
960 **************************/
962 r01 = _mm256_mul_ps(rsq01,rinv01);
963 r01 = _mm256_andnot_ps(dummy_mask,r01);
965 /* Calculate table index by multiplying r with table scale and truncate to integer */
966 rt = _mm256_mul_ps(r01,vftabscale);
967 vfitab = _mm256_cvttps_epi32(rt);
968 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
969 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
970 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
971 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
972 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
973 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
975 /* CUBIC SPLINE TABLE ELECTROSTATICS */
976 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
977 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
978 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
979 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
980 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
981 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
982 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
983 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
984 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
985 Heps = _mm256_mul_ps(vfeps,H);
986 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
987 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
988 velec = _mm256_mul_ps(qq01,VV);
989 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
990 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
992 /* Update potential sum for this i atom from the interaction with this j atom. */
993 velec = _mm256_andnot_ps(dummy_mask,velec);
994 velecsum = _mm256_add_ps(velecsum,velec);
998 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1000 /* Calculate temporary vectorial force */
1001 tx = _mm256_mul_ps(fscal,dx01);
1002 ty = _mm256_mul_ps(fscal,dy01);
1003 tz = _mm256_mul_ps(fscal,dz01);
1005 /* Update vectorial force */
1006 fix0 = _mm256_add_ps(fix0,tx);
1007 fiy0 = _mm256_add_ps(fiy0,ty);
1008 fiz0 = _mm256_add_ps(fiz0,tz);
1010 fjx1 = _mm256_add_ps(fjx1,tx);
1011 fjy1 = _mm256_add_ps(fjy1,ty);
1012 fjz1 = _mm256_add_ps(fjz1,tz);
1014 /**************************
1015 * CALCULATE INTERACTIONS *
1016 **************************/
1018 r02 = _mm256_mul_ps(rsq02,rinv02);
1019 r02 = _mm256_andnot_ps(dummy_mask,r02);
1021 /* Calculate table index by multiplying r with table scale and truncate to integer */
1022 rt = _mm256_mul_ps(r02,vftabscale);
1023 vfitab = _mm256_cvttps_epi32(rt);
1024 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1025 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1026 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1027 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1028 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1029 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1031 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1032 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1033 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1034 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1035 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1036 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1037 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1038 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1039 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1040 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1041 Heps = _mm256_mul_ps(vfeps,H);
1042 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1043 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1044 velec = _mm256_mul_ps(qq02,VV);
1045 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1046 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1048 /* Update potential sum for this i atom from the interaction with this j atom. */
1049 velec = _mm256_andnot_ps(dummy_mask,velec);
1050 velecsum = _mm256_add_ps(velecsum,velec);
1054 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1056 /* Calculate temporary vectorial force */
1057 tx = _mm256_mul_ps(fscal,dx02);
1058 ty = _mm256_mul_ps(fscal,dy02);
1059 tz = _mm256_mul_ps(fscal,dz02);
1061 /* Update vectorial force */
1062 fix0 = _mm256_add_ps(fix0,tx);
1063 fiy0 = _mm256_add_ps(fiy0,ty);
1064 fiz0 = _mm256_add_ps(fiz0,tz);
1066 fjx2 = _mm256_add_ps(fjx2,tx);
1067 fjy2 = _mm256_add_ps(fjy2,ty);
1068 fjz2 = _mm256_add_ps(fjz2,tz);
1070 /**************************
1071 * CALCULATE INTERACTIONS *
1072 **************************/
1074 r10 = _mm256_mul_ps(rsq10,rinv10);
1075 r10 = _mm256_andnot_ps(dummy_mask,r10);
1077 /* Calculate table index by multiplying r with table scale and truncate to integer */
1078 rt = _mm256_mul_ps(r10,vftabscale);
1079 vfitab = _mm256_cvttps_epi32(rt);
1080 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1081 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1082 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1083 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1084 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1085 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1087 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1088 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1089 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1090 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1091 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1092 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1093 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1094 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1095 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1096 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1097 Heps = _mm256_mul_ps(vfeps,H);
1098 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1099 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1100 velec = _mm256_mul_ps(qq10,VV);
1101 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1102 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1104 /* Update potential sum for this i atom from the interaction with this j atom. */
1105 velec = _mm256_andnot_ps(dummy_mask,velec);
1106 velecsum = _mm256_add_ps(velecsum,velec);
1110 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1112 /* Calculate temporary vectorial force */
1113 tx = _mm256_mul_ps(fscal,dx10);
1114 ty = _mm256_mul_ps(fscal,dy10);
1115 tz = _mm256_mul_ps(fscal,dz10);
1117 /* Update vectorial force */
1118 fix1 = _mm256_add_ps(fix1,tx);
1119 fiy1 = _mm256_add_ps(fiy1,ty);
1120 fiz1 = _mm256_add_ps(fiz1,tz);
1122 fjx0 = _mm256_add_ps(fjx0,tx);
1123 fjy0 = _mm256_add_ps(fjy0,ty);
1124 fjz0 = _mm256_add_ps(fjz0,tz);
1126 /**************************
1127 * CALCULATE INTERACTIONS *
1128 **************************/
1130 r11 = _mm256_mul_ps(rsq11,rinv11);
1131 r11 = _mm256_andnot_ps(dummy_mask,r11);
1133 /* Calculate table index by multiplying r with table scale and truncate to integer */
1134 rt = _mm256_mul_ps(r11,vftabscale);
1135 vfitab = _mm256_cvttps_epi32(rt);
1136 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1137 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1138 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1139 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1140 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1141 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1143 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1144 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1145 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1146 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1147 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1148 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1149 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1150 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1151 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1152 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1153 Heps = _mm256_mul_ps(vfeps,H);
1154 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1155 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1156 velec = _mm256_mul_ps(qq11,VV);
1157 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1158 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1160 /* Update potential sum for this i atom from the interaction with this j atom. */
1161 velec = _mm256_andnot_ps(dummy_mask,velec);
1162 velecsum = _mm256_add_ps(velecsum,velec);
1166 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1168 /* Calculate temporary vectorial force */
1169 tx = _mm256_mul_ps(fscal,dx11);
1170 ty = _mm256_mul_ps(fscal,dy11);
1171 tz = _mm256_mul_ps(fscal,dz11);
1173 /* Update vectorial force */
1174 fix1 = _mm256_add_ps(fix1,tx);
1175 fiy1 = _mm256_add_ps(fiy1,ty);
1176 fiz1 = _mm256_add_ps(fiz1,tz);
1178 fjx1 = _mm256_add_ps(fjx1,tx);
1179 fjy1 = _mm256_add_ps(fjy1,ty);
1180 fjz1 = _mm256_add_ps(fjz1,tz);
1182 /**************************
1183 * CALCULATE INTERACTIONS *
1184 **************************/
1186 r12 = _mm256_mul_ps(rsq12,rinv12);
1187 r12 = _mm256_andnot_ps(dummy_mask,r12);
1189 /* Calculate table index by multiplying r with table scale and truncate to integer */
1190 rt = _mm256_mul_ps(r12,vftabscale);
1191 vfitab = _mm256_cvttps_epi32(rt);
1192 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1193 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1194 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1195 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1196 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1197 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1199 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1200 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1201 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1202 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1203 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1204 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1205 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1206 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1207 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1208 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1209 Heps = _mm256_mul_ps(vfeps,H);
1210 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1211 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1212 velec = _mm256_mul_ps(qq12,VV);
1213 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1214 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1216 /* Update potential sum for this i atom from the interaction with this j atom. */
1217 velec = _mm256_andnot_ps(dummy_mask,velec);
1218 velecsum = _mm256_add_ps(velecsum,velec);
1222 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1224 /* Calculate temporary vectorial force */
1225 tx = _mm256_mul_ps(fscal,dx12);
1226 ty = _mm256_mul_ps(fscal,dy12);
1227 tz = _mm256_mul_ps(fscal,dz12);
1229 /* Update vectorial force */
1230 fix1 = _mm256_add_ps(fix1,tx);
1231 fiy1 = _mm256_add_ps(fiy1,ty);
1232 fiz1 = _mm256_add_ps(fiz1,tz);
1234 fjx2 = _mm256_add_ps(fjx2,tx);
1235 fjy2 = _mm256_add_ps(fjy2,ty);
1236 fjz2 = _mm256_add_ps(fjz2,tz);
1238 /**************************
1239 * CALCULATE INTERACTIONS *
1240 **************************/
1242 r20 = _mm256_mul_ps(rsq20,rinv20);
1243 r20 = _mm256_andnot_ps(dummy_mask,r20);
1245 /* Calculate table index by multiplying r with table scale and truncate to integer */
1246 rt = _mm256_mul_ps(r20,vftabscale);
1247 vfitab = _mm256_cvttps_epi32(rt);
1248 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1249 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1250 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1251 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1252 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1253 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1255 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1256 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1257 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1258 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1259 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1260 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1261 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1262 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1263 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1264 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1265 Heps = _mm256_mul_ps(vfeps,H);
1266 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1267 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1268 velec = _mm256_mul_ps(qq20,VV);
1269 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1270 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1272 /* Update potential sum for this i atom from the interaction with this j atom. */
1273 velec = _mm256_andnot_ps(dummy_mask,velec);
1274 velecsum = _mm256_add_ps(velecsum,velec);
1278 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1280 /* Calculate temporary vectorial force */
1281 tx = _mm256_mul_ps(fscal,dx20);
1282 ty = _mm256_mul_ps(fscal,dy20);
1283 tz = _mm256_mul_ps(fscal,dz20);
1285 /* Update vectorial force */
1286 fix2 = _mm256_add_ps(fix2,tx);
1287 fiy2 = _mm256_add_ps(fiy2,ty);
1288 fiz2 = _mm256_add_ps(fiz2,tz);
1290 fjx0 = _mm256_add_ps(fjx0,tx);
1291 fjy0 = _mm256_add_ps(fjy0,ty);
1292 fjz0 = _mm256_add_ps(fjz0,tz);
1294 /**************************
1295 * CALCULATE INTERACTIONS *
1296 **************************/
1298 r21 = _mm256_mul_ps(rsq21,rinv21);
1299 r21 = _mm256_andnot_ps(dummy_mask,r21);
1301 /* Calculate table index by multiplying r with table scale and truncate to integer */
1302 rt = _mm256_mul_ps(r21,vftabscale);
1303 vfitab = _mm256_cvttps_epi32(rt);
1304 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1305 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1306 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1307 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1308 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1309 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1311 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1312 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1313 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1314 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1315 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1316 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1317 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1318 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1319 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1320 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1321 Heps = _mm256_mul_ps(vfeps,H);
1322 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1323 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1324 velec = _mm256_mul_ps(qq21,VV);
1325 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1326 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1328 /* Update potential sum for this i atom from the interaction with this j atom. */
1329 velec = _mm256_andnot_ps(dummy_mask,velec);
1330 velecsum = _mm256_add_ps(velecsum,velec);
1334 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1336 /* Calculate temporary vectorial force */
1337 tx = _mm256_mul_ps(fscal,dx21);
1338 ty = _mm256_mul_ps(fscal,dy21);
1339 tz = _mm256_mul_ps(fscal,dz21);
1341 /* Update vectorial force */
1342 fix2 = _mm256_add_ps(fix2,tx);
1343 fiy2 = _mm256_add_ps(fiy2,ty);
1344 fiz2 = _mm256_add_ps(fiz2,tz);
1346 fjx1 = _mm256_add_ps(fjx1,tx);
1347 fjy1 = _mm256_add_ps(fjy1,ty);
1348 fjz1 = _mm256_add_ps(fjz1,tz);
1350 /**************************
1351 * CALCULATE INTERACTIONS *
1352 **************************/
1354 r22 = _mm256_mul_ps(rsq22,rinv22);
1355 r22 = _mm256_andnot_ps(dummy_mask,r22);
1357 /* Calculate table index by multiplying r with table scale and truncate to integer */
1358 rt = _mm256_mul_ps(r22,vftabscale);
1359 vfitab = _mm256_cvttps_epi32(rt);
1360 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1361 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1362 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1363 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1364 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1365 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1367 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1368 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1369 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1370 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1371 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1372 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1373 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1374 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1375 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1376 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1377 Heps = _mm256_mul_ps(vfeps,H);
1378 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1379 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1380 velec = _mm256_mul_ps(qq22,VV);
1381 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1382 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1384 /* Update potential sum for this i atom from the interaction with this j atom. */
1385 velec = _mm256_andnot_ps(dummy_mask,velec);
1386 velecsum = _mm256_add_ps(velecsum,velec);
1390 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1392 /* Calculate temporary vectorial force */
1393 tx = _mm256_mul_ps(fscal,dx22);
1394 ty = _mm256_mul_ps(fscal,dy22);
1395 tz = _mm256_mul_ps(fscal,dz22);
1397 /* Update vectorial force */
1398 fix2 = _mm256_add_ps(fix2,tx);
1399 fiy2 = _mm256_add_ps(fiy2,ty);
1400 fiz2 = _mm256_add_ps(fiz2,tz);
1402 fjx2 = _mm256_add_ps(fjx2,tx);
1403 fjy2 = _mm256_add_ps(fjy2,ty);
1404 fjz2 = _mm256_add_ps(fjz2,tz);
1406 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1407 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1408 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1409 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1410 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1411 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1412 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1413 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1415 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1416 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1418 /* Inner loop uses 409 flops */
1421 /* End of innermost loop */
1423 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1424 f+i_coord_offset,fshift+i_shift_offset);
1427 /* Update potential energies */
1428 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1429 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1431 /* Increment number of inner iterations */
1432 inneriter += j_index_end - j_index_start;
1434 /* Outer loop uses 20 flops */
1437 /* Increment number of outer iterations */
1440 /* Update outer/inner flops */
1442 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1445 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single
1446 * Electrostatics interaction: CubicSplineTable
1447 * VdW interaction: LennardJones
1448 * Geometry: Water3-Water3
1449 * Calculate force/pot: Force
1452 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single
1453 (t_nblist * gmx_restrict nlist,
1454 rvec * gmx_restrict xx,
1455 rvec * gmx_restrict ff,
1456 t_forcerec * gmx_restrict fr,
1457 t_mdatoms * gmx_restrict mdatoms,
1458 nb_kernel_data_t * gmx_restrict kernel_data,
1459 t_nrnb * gmx_restrict nrnb)
1461 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1462 * just 0 for non-waters.
1463 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1464 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1466 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1467 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1468 int jnrA,jnrB,jnrC,jnrD;
1469 int jnrE,jnrF,jnrG,jnrH;
1470 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1471 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1472 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1473 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1474 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1475 real rcutoff_scalar;
1476 real *shiftvec,*fshift,*x,*f;
1477 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1478 real scratch[4*DIM];
1479 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1480 real * vdwioffsetptr0;
1481 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1482 real * vdwioffsetptr1;
1483 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1484 real * vdwioffsetptr2;
1485 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1486 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1487 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1488 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1489 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1490 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1491 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1492 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1493 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1494 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1495 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1496 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1497 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1498 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1499 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1500 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1501 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1504 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1507 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1508 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1510 __m128i vfitab_lo,vfitab_hi;
1511 __m128i ifour = _mm_set1_epi32(4);
1512 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1514 __m256 dummy_mask,cutoff_mask;
1515 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1516 __m256 one = _mm256_set1_ps(1.0);
1517 __m256 two = _mm256_set1_ps(2.0);
1523 jindex = nlist->jindex;
1525 shiftidx = nlist->shift;
1527 shiftvec = fr->shift_vec[0];
1528 fshift = fr->fshift[0];
1529 facel = _mm256_set1_ps(fr->epsfac);
1530 charge = mdatoms->chargeA;
1531 nvdwtype = fr->ntype;
1532 vdwparam = fr->nbfp;
1533 vdwtype = mdatoms->typeA;
1535 vftab = kernel_data->table_elec->data;
1536 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
1538 /* Setup water-specific parameters */
1539 inr = nlist->iinr[0];
1540 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1541 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1542 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1543 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1545 jq0 = _mm256_set1_ps(charge[inr+0]);
1546 jq1 = _mm256_set1_ps(charge[inr+1]);
1547 jq2 = _mm256_set1_ps(charge[inr+2]);
1548 vdwjidx0A = 2*vdwtype[inr+0];
1549 qq00 = _mm256_mul_ps(iq0,jq0);
1550 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1551 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1552 qq01 = _mm256_mul_ps(iq0,jq1);
1553 qq02 = _mm256_mul_ps(iq0,jq2);
1554 qq10 = _mm256_mul_ps(iq1,jq0);
1555 qq11 = _mm256_mul_ps(iq1,jq1);
1556 qq12 = _mm256_mul_ps(iq1,jq2);
1557 qq20 = _mm256_mul_ps(iq2,jq0);
1558 qq21 = _mm256_mul_ps(iq2,jq1);
1559 qq22 = _mm256_mul_ps(iq2,jq2);
1561 /* Avoid stupid compiler warnings */
1562 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1563 j_coord_offsetA = 0;
1564 j_coord_offsetB = 0;
1565 j_coord_offsetC = 0;
1566 j_coord_offsetD = 0;
1567 j_coord_offsetE = 0;
1568 j_coord_offsetF = 0;
1569 j_coord_offsetG = 0;
1570 j_coord_offsetH = 0;
1575 for(iidx=0;iidx<4*DIM;iidx++)
1577 scratch[iidx] = 0.0;
1580 /* Start outer loop over neighborlists */
1581 for(iidx=0; iidx<nri; iidx++)
1583 /* Load shift vector for this list */
1584 i_shift_offset = DIM*shiftidx[iidx];
1586 /* Load limits for loop over neighbors */
1587 j_index_start = jindex[iidx];
1588 j_index_end = jindex[iidx+1];
1590 /* Get outer coordinate index */
1592 i_coord_offset = DIM*inr;
1594 /* Load i particle coords and add shift vector */
1595 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1596 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1598 fix0 = _mm256_setzero_ps();
1599 fiy0 = _mm256_setzero_ps();
1600 fiz0 = _mm256_setzero_ps();
1601 fix1 = _mm256_setzero_ps();
1602 fiy1 = _mm256_setzero_ps();
1603 fiz1 = _mm256_setzero_ps();
1604 fix2 = _mm256_setzero_ps();
1605 fiy2 = _mm256_setzero_ps();
1606 fiz2 = _mm256_setzero_ps();
1608 /* Start inner kernel loop */
1609 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1612 /* Get j neighbor index, and coordinate index */
1614 jnrB = jjnr[jidx+1];
1615 jnrC = jjnr[jidx+2];
1616 jnrD = jjnr[jidx+3];
1617 jnrE = jjnr[jidx+4];
1618 jnrF = jjnr[jidx+5];
1619 jnrG = jjnr[jidx+6];
1620 jnrH = jjnr[jidx+7];
1621 j_coord_offsetA = DIM*jnrA;
1622 j_coord_offsetB = DIM*jnrB;
1623 j_coord_offsetC = DIM*jnrC;
1624 j_coord_offsetD = DIM*jnrD;
1625 j_coord_offsetE = DIM*jnrE;
1626 j_coord_offsetF = DIM*jnrF;
1627 j_coord_offsetG = DIM*jnrG;
1628 j_coord_offsetH = DIM*jnrH;
1630 /* load j atom coordinates */
1631 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1632 x+j_coord_offsetC,x+j_coord_offsetD,
1633 x+j_coord_offsetE,x+j_coord_offsetF,
1634 x+j_coord_offsetG,x+j_coord_offsetH,
1635 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1637 /* Calculate displacement vector */
1638 dx00 = _mm256_sub_ps(ix0,jx0);
1639 dy00 = _mm256_sub_ps(iy0,jy0);
1640 dz00 = _mm256_sub_ps(iz0,jz0);
1641 dx01 = _mm256_sub_ps(ix0,jx1);
1642 dy01 = _mm256_sub_ps(iy0,jy1);
1643 dz01 = _mm256_sub_ps(iz0,jz1);
1644 dx02 = _mm256_sub_ps(ix0,jx2);
1645 dy02 = _mm256_sub_ps(iy0,jy2);
1646 dz02 = _mm256_sub_ps(iz0,jz2);
1647 dx10 = _mm256_sub_ps(ix1,jx0);
1648 dy10 = _mm256_sub_ps(iy1,jy0);
1649 dz10 = _mm256_sub_ps(iz1,jz0);
1650 dx11 = _mm256_sub_ps(ix1,jx1);
1651 dy11 = _mm256_sub_ps(iy1,jy1);
1652 dz11 = _mm256_sub_ps(iz1,jz1);
1653 dx12 = _mm256_sub_ps(ix1,jx2);
1654 dy12 = _mm256_sub_ps(iy1,jy2);
1655 dz12 = _mm256_sub_ps(iz1,jz2);
1656 dx20 = _mm256_sub_ps(ix2,jx0);
1657 dy20 = _mm256_sub_ps(iy2,jy0);
1658 dz20 = _mm256_sub_ps(iz2,jz0);
1659 dx21 = _mm256_sub_ps(ix2,jx1);
1660 dy21 = _mm256_sub_ps(iy2,jy1);
1661 dz21 = _mm256_sub_ps(iz2,jz1);
1662 dx22 = _mm256_sub_ps(ix2,jx2);
1663 dy22 = _mm256_sub_ps(iy2,jy2);
1664 dz22 = _mm256_sub_ps(iz2,jz2);
1666 /* Calculate squared distance and things based on it */
1667 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1668 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1669 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1670 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1671 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1672 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1673 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1674 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1675 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1677 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1678 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1679 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1680 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1681 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1682 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1683 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1684 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1685 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1687 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1689 fjx0 = _mm256_setzero_ps();
1690 fjy0 = _mm256_setzero_ps();
1691 fjz0 = _mm256_setzero_ps();
1692 fjx1 = _mm256_setzero_ps();
1693 fjy1 = _mm256_setzero_ps();
1694 fjz1 = _mm256_setzero_ps();
1695 fjx2 = _mm256_setzero_ps();
1696 fjy2 = _mm256_setzero_ps();
1697 fjz2 = _mm256_setzero_ps();
1699 /**************************
1700 * CALCULATE INTERACTIONS *
1701 **************************/
1703 r00 = _mm256_mul_ps(rsq00,rinv00);
1705 /* Calculate table index by multiplying r with table scale and truncate to integer */
1706 rt = _mm256_mul_ps(r00,vftabscale);
1707 vfitab = _mm256_cvttps_epi32(rt);
1708 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1709 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1710 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1711 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1712 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1713 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1715 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1716 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1717 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1718 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1719 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1720 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1721 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1722 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1723 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1724 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1725 Heps = _mm256_mul_ps(vfeps,H);
1726 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1727 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1728 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1730 /* LENNARD-JONES DISPERSION/REPULSION */
1732 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1733 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1735 fscal = _mm256_add_ps(felec,fvdw);
1737 /* Calculate temporary vectorial force */
1738 tx = _mm256_mul_ps(fscal,dx00);
1739 ty = _mm256_mul_ps(fscal,dy00);
1740 tz = _mm256_mul_ps(fscal,dz00);
1742 /* Update vectorial force */
1743 fix0 = _mm256_add_ps(fix0,tx);
1744 fiy0 = _mm256_add_ps(fiy0,ty);
1745 fiz0 = _mm256_add_ps(fiz0,tz);
1747 fjx0 = _mm256_add_ps(fjx0,tx);
1748 fjy0 = _mm256_add_ps(fjy0,ty);
1749 fjz0 = _mm256_add_ps(fjz0,tz);
1751 /**************************
1752 * CALCULATE INTERACTIONS *
1753 **************************/
1755 r01 = _mm256_mul_ps(rsq01,rinv01);
1757 /* Calculate table index by multiplying r with table scale and truncate to integer */
1758 rt = _mm256_mul_ps(r01,vftabscale);
1759 vfitab = _mm256_cvttps_epi32(rt);
1760 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1761 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1762 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1763 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1764 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1765 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1767 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1768 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1769 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1770 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1771 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1772 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1773 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1774 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1775 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1776 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1777 Heps = _mm256_mul_ps(vfeps,H);
1778 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1779 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1780 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1784 /* Calculate temporary vectorial force */
1785 tx = _mm256_mul_ps(fscal,dx01);
1786 ty = _mm256_mul_ps(fscal,dy01);
1787 tz = _mm256_mul_ps(fscal,dz01);
1789 /* Update vectorial force */
1790 fix0 = _mm256_add_ps(fix0,tx);
1791 fiy0 = _mm256_add_ps(fiy0,ty);
1792 fiz0 = _mm256_add_ps(fiz0,tz);
1794 fjx1 = _mm256_add_ps(fjx1,tx);
1795 fjy1 = _mm256_add_ps(fjy1,ty);
1796 fjz1 = _mm256_add_ps(fjz1,tz);
1798 /**************************
1799 * CALCULATE INTERACTIONS *
1800 **************************/
1802 r02 = _mm256_mul_ps(rsq02,rinv02);
1804 /* Calculate table index by multiplying r with table scale and truncate to integer */
1805 rt = _mm256_mul_ps(r02,vftabscale);
1806 vfitab = _mm256_cvttps_epi32(rt);
1807 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1808 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1809 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1810 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1811 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1812 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1814 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1815 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1816 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1817 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1818 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1819 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1820 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1821 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1822 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1823 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1824 Heps = _mm256_mul_ps(vfeps,H);
1825 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1826 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1827 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1831 /* Calculate temporary vectorial force */
1832 tx = _mm256_mul_ps(fscal,dx02);
1833 ty = _mm256_mul_ps(fscal,dy02);
1834 tz = _mm256_mul_ps(fscal,dz02);
1836 /* Update vectorial force */
1837 fix0 = _mm256_add_ps(fix0,tx);
1838 fiy0 = _mm256_add_ps(fiy0,ty);
1839 fiz0 = _mm256_add_ps(fiz0,tz);
1841 fjx2 = _mm256_add_ps(fjx2,tx);
1842 fjy2 = _mm256_add_ps(fjy2,ty);
1843 fjz2 = _mm256_add_ps(fjz2,tz);
1845 /**************************
1846 * CALCULATE INTERACTIONS *
1847 **************************/
1849 r10 = _mm256_mul_ps(rsq10,rinv10);
1851 /* Calculate table index by multiplying r with table scale and truncate to integer */
1852 rt = _mm256_mul_ps(r10,vftabscale);
1853 vfitab = _mm256_cvttps_epi32(rt);
1854 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1855 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1856 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1857 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1858 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1859 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1861 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1862 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1863 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1864 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1865 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1866 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1867 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1868 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1869 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1870 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1871 Heps = _mm256_mul_ps(vfeps,H);
1872 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1873 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1874 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1878 /* Calculate temporary vectorial force */
1879 tx = _mm256_mul_ps(fscal,dx10);
1880 ty = _mm256_mul_ps(fscal,dy10);
1881 tz = _mm256_mul_ps(fscal,dz10);
1883 /* Update vectorial force */
1884 fix1 = _mm256_add_ps(fix1,tx);
1885 fiy1 = _mm256_add_ps(fiy1,ty);
1886 fiz1 = _mm256_add_ps(fiz1,tz);
1888 fjx0 = _mm256_add_ps(fjx0,tx);
1889 fjy0 = _mm256_add_ps(fjy0,ty);
1890 fjz0 = _mm256_add_ps(fjz0,tz);
1892 /**************************
1893 * CALCULATE INTERACTIONS *
1894 **************************/
1896 r11 = _mm256_mul_ps(rsq11,rinv11);
1898 /* Calculate table index by multiplying r with table scale and truncate to integer */
1899 rt = _mm256_mul_ps(r11,vftabscale);
1900 vfitab = _mm256_cvttps_epi32(rt);
1901 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1902 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1903 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1904 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1905 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1906 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1908 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1909 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1910 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1911 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1912 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1913 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1914 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1915 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1916 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1917 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1918 Heps = _mm256_mul_ps(vfeps,H);
1919 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1920 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1921 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1925 /* Calculate temporary vectorial force */
1926 tx = _mm256_mul_ps(fscal,dx11);
1927 ty = _mm256_mul_ps(fscal,dy11);
1928 tz = _mm256_mul_ps(fscal,dz11);
1930 /* Update vectorial force */
1931 fix1 = _mm256_add_ps(fix1,tx);
1932 fiy1 = _mm256_add_ps(fiy1,ty);
1933 fiz1 = _mm256_add_ps(fiz1,tz);
1935 fjx1 = _mm256_add_ps(fjx1,tx);
1936 fjy1 = _mm256_add_ps(fjy1,ty);
1937 fjz1 = _mm256_add_ps(fjz1,tz);
1939 /**************************
1940 * CALCULATE INTERACTIONS *
1941 **************************/
1943 r12 = _mm256_mul_ps(rsq12,rinv12);
1945 /* Calculate table index by multiplying r with table scale and truncate to integer */
1946 rt = _mm256_mul_ps(r12,vftabscale);
1947 vfitab = _mm256_cvttps_epi32(rt);
1948 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1949 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1950 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1951 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1952 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1953 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1955 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1956 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1957 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1958 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1959 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1960 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1961 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1962 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1963 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1964 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1965 Heps = _mm256_mul_ps(vfeps,H);
1966 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1967 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1968 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1972 /* Calculate temporary vectorial force */
1973 tx = _mm256_mul_ps(fscal,dx12);
1974 ty = _mm256_mul_ps(fscal,dy12);
1975 tz = _mm256_mul_ps(fscal,dz12);
1977 /* Update vectorial force */
1978 fix1 = _mm256_add_ps(fix1,tx);
1979 fiy1 = _mm256_add_ps(fiy1,ty);
1980 fiz1 = _mm256_add_ps(fiz1,tz);
1982 fjx2 = _mm256_add_ps(fjx2,tx);
1983 fjy2 = _mm256_add_ps(fjy2,ty);
1984 fjz2 = _mm256_add_ps(fjz2,tz);
1986 /**************************
1987 * CALCULATE INTERACTIONS *
1988 **************************/
1990 r20 = _mm256_mul_ps(rsq20,rinv20);
1992 /* Calculate table index by multiplying r with table scale and truncate to integer */
1993 rt = _mm256_mul_ps(r20,vftabscale);
1994 vfitab = _mm256_cvttps_epi32(rt);
1995 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1996 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1997 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1998 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1999 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2000 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2002 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2003 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2004 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2005 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2006 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2007 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2008 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2009 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2010 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2011 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2012 Heps = _mm256_mul_ps(vfeps,H);
2013 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2014 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2015 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2019 /* Calculate temporary vectorial force */
2020 tx = _mm256_mul_ps(fscal,dx20);
2021 ty = _mm256_mul_ps(fscal,dy20);
2022 tz = _mm256_mul_ps(fscal,dz20);
2024 /* Update vectorial force */
2025 fix2 = _mm256_add_ps(fix2,tx);
2026 fiy2 = _mm256_add_ps(fiy2,ty);
2027 fiz2 = _mm256_add_ps(fiz2,tz);
2029 fjx0 = _mm256_add_ps(fjx0,tx);
2030 fjy0 = _mm256_add_ps(fjy0,ty);
2031 fjz0 = _mm256_add_ps(fjz0,tz);
2033 /**************************
2034 * CALCULATE INTERACTIONS *
2035 **************************/
2037 r21 = _mm256_mul_ps(rsq21,rinv21);
2039 /* Calculate table index by multiplying r with table scale and truncate to integer */
2040 rt = _mm256_mul_ps(r21,vftabscale);
2041 vfitab = _mm256_cvttps_epi32(rt);
2042 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2043 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2044 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2045 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2046 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2047 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2049 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2050 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2051 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2052 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2053 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2054 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2055 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2056 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2057 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2058 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2059 Heps = _mm256_mul_ps(vfeps,H);
2060 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2061 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2062 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2066 /* Calculate temporary vectorial force */
2067 tx = _mm256_mul_ps(fscal,dx21);
2068 ty = _mm256_mul_ps(fscal,dy21);
2069 tz = _mm256_mul_ps(fscal,dz21);
2071 /* Update vectorial force */
2072 fix2 = _mm256_add_ps(fix2,tx);
2073 fiy2 = _mm256_add_ps(fiy2,ty);
2074 fiz2 = _mm256_add_ps(fiz2,tz);
2076 fjx1 = _mm256_add_ps(fjx1,tx);
2077 fjy1 = _mm256_add_ps(fjy1,ty);
2078 fjz1 = _mm256_add_ps(fjz1,tz);
2080 /**************************
2081 * CALCULATE INTERACTIONS *
2082 **************************/
2084 r22 = _mm256_mul_ps(rsq22,rinv22);
2086 /* Calculate table index by multiplying r with table scale and truncate to integer */
2087 rt = _mm256_mul_ps(r22,vftabscale);
2088 vfitab = _mm256_cvttps_epi32(rt);
2089 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2090 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2091 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2092 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2093 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2094 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2096 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2097 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2098 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2099 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2100 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2101 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2102 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2103 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2104 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2105 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2106 Heps = _mm256_mul_ps(vfeps,H);
2107 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2108 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2109 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2113 /* Calculate temporary vectorial force */
2114 tx = _mm256_mul_ps(fscal,dx22);
2115 ty = _mm256_mul_ps(fscal,dy22);
2116 tz = _mm256_mul_ps(fscal,dz22);
2118 /* Update vectorial force */
2119 fix2 = _mm256_add_ps(fix2,tx);
2120 fiy2 = _mm256_add_ps(fiy2,ty);
2121 fiz2 = _mm256_add_ps(fiz2,tz);
2123 fjx2 = _mm256_add_ps(fjx2,tx);
2124 fjy2 = _mm256_add_ps(fjy2,ty);
2125 fjz2 = _mm256_add_ps(fjz2,tz);
2127 fjptrA = f+j_coord_offsetA;
2128 fjptrB = f+j_coord_offsetB;
2129 fjptrC = f+j_coord_offsetC;
2130 fjptrD = f+j_coord_offsetD;
2131 fjptrE = f+j_coord_offsetE;
2132 fjptrF = f+j_coord_offsetF;
2133 fjptrG = f+j_coord_offsetG;
2134 fjptrH = f+j_coord_offsetH;
2136 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2137 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2139 /* Inner loop uses 359 flops */
2142 if(jidx<j_index_end)
2145 /* Get j neighbor index, and coordinate index */
2146 jnrlistA = jjnr[jidx];
2147 jnrlistB = jjnr[jidx+1];
2148 jnrlistC = jjnr[jidx+2];
2149 jnrlistD = jjnr[jidx+3];
2150 jnrlistE = jjnr[jidx+4];
2151 jnrlistF = jjnr[jidx+5];
2152 jnrlistG = jjnr[jidx+6];
2153 jnrlistH = jjnr[jidx+7];
2154 /* Sign of each element will be negative for non-real atoms.
2155 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2156 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2158 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2159 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2161 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2162 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2163 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2164 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2165 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2166 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2167 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2168 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2169 j_coord_offsetA = DIM*jnrA;
2170 j_coord_offsetB = DIM*jnrB;
2171 j_coord_offsetC = DIM*jnrC;
2172 j_coord_offsetD = DIM*jnrD;
2173 j_coord_offsetE = DIM*jnrE;
2174 j_coord_offsetF = DIM*jnrF;
2175 j_coord_offsetG = DIM*jnrG;
2176 j_coord_offsetH = DIM*jnrH;
2178 /* load j atom coordinates */
2179 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2180 x+j_coord_offsetC,x+j_coord_offsetD,
2181 x+j_coord_offsetE,x+j_coord_offsetF,
2182 x+j_coord_offsetG,x+j_coord_offsetH,
2183 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2185 /* Calculate displacement vector */
2186 dx00 = _mm256_sub_ps(ix0,jx0);
2187 dy00 = _mm256_sub_ps(iy0,jy0);
2188 dz00 = _mm256_sub_ps(iz0,jz0);
2189 dx01 = _mm256_sub_ps(ix0,jx1);
2190 dy01 = _mm256_sub_ps(iy0,jy1);
2191 dz01 = _mm256_sub_ps(iz0,jz1);
2192 dx02 = _mm256_sub_ps(ix0,jx2);
2193 dy02 = _mm256_sub_ps(iy0,jy2);
2194 dz02 = _mm256_sub_ps(iz0,jz2);
2195 dx10 = _mm256_sub_ps(ix1,jx0);
2196 dy10 = _mm256_sub_ps(iy1,jy0);
2197 dz10 = _mm256_sub_ps(iz1,jz0);
2198 dx11 = _mm256_sub_ps(ix1,jx1);
2199 dy11 = _mm256_sub_ps(iy1,jy1);
2200 dz11 = _mm256_sub_ps(iz1,jz1);
2201 dx12 = _mm256_sub_ps(ix1,jx2);
2202 dy12 = _mm256_sub_ps(iy1,jy2);
2203 dz12 = _mm256_sub_ps(iz1,jz2);
2204 dx20 = _mm256_sub_ps(ix2,jx0);
2205 dy20 = _mm256_sub_ps(iy2,jy0);
2206 dz20 = _mm256_sub_ps(iz2,jz0);
2207 dx21 = _mm256_sub_ps(ix2,jx1);
2208 dy21 = _mm256_sub_ps(iy2,jy1);
2209 dz21 = _mm256_sub_ps(iz2,jz1);
2210 dx22 = _mm256_sub_ps(ix2,jx2);
2211 dy22 = _mm256_sub_ps(iy2,jy2);
2212 dz22 = _mm256_sub_ps(iz2,jz2);
2214 /* Calculate squared distance and things based on it */
2215 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2216 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2217 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2218 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2219 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2220 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2221 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2222 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2223 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2225 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2226 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
2227 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
2228 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
2229 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2230 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2231 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
2232 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2233 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2235 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2237 fjx0 = _mm256_setzero_ps();
2238 fjy0 = _mm256_setzero_ps();
2239 fjz0 = _mm256_setzero_ps();
2240 fjx1 = _mm256_setzero_ps();
2241 fjy1 = _mm256_setzero_ps();
2242 fjz1 = _mm256_setzero_ps();
2243 fjx2 = _mm256_setzero_ps();
2244 fjy2 = _mm256_setzero_ps();
2245 fjz2 = _mm256_setzero_ps();
2247 /**************************
2248 * CALCULATE INTERACTIONS *
2249 **************************/
2251 r00 = _mm256_mul_ps(rsq00,rinv00);
2252 r00 = _mm256_andnot_ps(dummy_mask,r00);
2254 /* Calculate table index by multiplying r with table scale and truncate to integer */
2255 rt = _mm256_mul_ps(r00,vftabscale);
2256 vfitab = _mm256_cvttps_epi32(rt);
2257 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2258 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2259 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2260 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2261 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2262 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2264 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2265 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2266 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2267 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2268 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2269 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2270 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2271 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2272 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2273 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2274 Heps = _mm256_mul_ps(vfeps,H);
2275 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2276 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2277 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2279 /* LENNARD-JONES DISPERSION/REPULSION */
2281 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2282 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2284 fscal = _mm256_add_ps(felec,fvdw);
2286 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2288 /* Calculate temporary vectorial force */
2289 tx = _mm256_mul_ps(fscal,dx00);
2290 ty = _mm256_mul_ps(fscal,dy00);
2291 tz = _mm256_mul_ps(fscal,dz00);
2293 /* Update vectorial force */
2294 fix0 = _mm256_add_ps(fix0,tx);
2295 fiy0 = _mm256_add_ps(fiy0,ty);
2296 fiz0 = _mm256_add_ps(fiz0,tz);
2298 fjx0 = _mm256_add_ps(fjx0,tx);
2299 fjy0 = _mm256_add_ps(fjy0,ty);
2300 fjz0 = _mm256_add_ps(fjz0,tz);
2302 /**************************
2303 * CALCULATE INTERACTIONS *
2304 **************************/
2306 r01 = _mm256_mul_ps(rsq01,rinv01);
2307 r01 = _mm256_andnot_ps(dummy_mask,r01);
2309 /* Calculate table index by multiplying r with table scale and truncate to integer */
2310 rt = _mm256_mul_ps(r01,vftabscale);
2311 vfitab = _mm256_cvttps_epi32(rt);
2312 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2313 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2314 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2315 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2316 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2317 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2319 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2320 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2321 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2322 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2323 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2324 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2325 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2326 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2327 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2328 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2329 Heps = _mm256_mul_ps(vfeps,H);
2330 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2331 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2332 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2336 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2338 /* Calculate temporary vectorial force */
2339 tx = _mm256_mul_ps(fscal,dx01);
2340 ty = _mm256_mul_ps(fscal,dy01);
2341 tz = _mm256_mul_ps(fscal,dz01);
2343 /* Update vectorial force */
2344 fix0 = _mm256_add_ps(fix0,tx);
2345 fiy0 = _mm256_add_ps(fiy0,ty);
2346 fiz0 = _mm256_add_ps(fiz0,tz);
2348 fjx1 = _mm256_add_ps(fjx1,tx);
2349 fjy1 = _mm256_add_ps(fjy1,ty);
2350 fjz1 = _mm256_add_ps(fjz1,tz);
2352 /**************************
2353 * CALCULATE INTERACTIONS *
2354 **************************/
2356 r02 = _mm256_mul_ps(rsq02,rinv02);
2357 r02 = _mm256_andnot_ps(dummy_mask,r02);
2359 /* Calculate table index by multiplying r with table scale and truncate to integer */
2360 rt = _mm256_mul_ps(r02,vftabscale);
2361 vfitab = _mm256_cvttps_epi32(rt);
2362 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2363 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2364 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2365 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2366 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2367 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2369 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2370 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2371 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2372 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2373 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2374 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2375 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2376 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2377 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2378 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2379 Heps = _mm256_mul_ps(vfeps,H);
2380 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2381 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2382 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2386 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2388 /* Calculate temporary vectorial force */
2389 tx = _mm256_mul_ps(fscal,dx02);
2390 ty = _mm256_mul_ps(fscal,dy02);
2391 tz = _mm256_mul_ps(fscal,dz02);
2393 /* Update vectorial force */
2394 fix0 = _mm256_add_ps(fix0,tx);
2395 fiy0 = _mm256_add_ps(fiy0,ty);
2396 fiz0 = _mm256_add_ps(fiz0,tz);
2398 fjx2 = _mm256_add_ps(fjx2,tx);
2399 fjy2 = _mm256_add_ps(fjy2,ty);
2400 fjz2 = _mm256_add_ps(fjz2,tz);
2402 /**************************
2403 * CALCULATE INTERACTIONS *
2404 **************************/
2406 r10 = _mm256_mul_ps(rsq10,rinv10);
2407 r10 = _mm256_andnot_ps(dummy_mask,r10);
2409 /* Calculate table index by multiplying r with table scale and truncate to integer */
2410 rt = _mm256_mul_ps(r10,vftabscale);
2411 vfitab = _mm256_cvttps_epi32(rt);
2412 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2413 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2414 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2415 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2416 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2417 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2419 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2420 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2421 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2422 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2423 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2424 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2425 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2426 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2427 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2428 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2429 Heps = _mm256_mul_ps(vfeps,H);
2430 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2431 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2432 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2436 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2438 /* Calculate temporary vectorial force */
2439 tx = _mm256_mul_ps(fscal,dx10);
2440 ty = _mm256_mul_ps(fscal,dy10);
2441 tz = _mm256_mul_ps(fscal,dz10);
2443 /* Update vectorial force */
2444 fix1 = _mm256_add_ps(fix1,tx);
2445 fiy1 = _mm256_add_ps(fiy1,ty);
2446 fiz1 = _mm256_add_ps(fiz1,tz);
2448 fjx0 = _mm256_add_ps(fjx0,tx);
2449 fjy0 = _mm256_add_ps(fjy0,ty);
2450 fjz0 = _mm256_add_ps(fjz0,tz);
2452 /**************************
2453 * CALCULATE INTERACTIONS *
2454 **************************/
2456 r11 = _mm256_mul_ps(rsq11,rinv11);
2457 r11 = _mm256_andnot_ps(dummy_mask,r11);
2459 /* Calculate table index by multiplying r with table scale and truncate to integer */
2460 rt = _mm256_mul_ps(r11,vftabscale);
2461 vfitab = _mm256_cvttps_epi32(rt);
2462 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2463 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2464 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2465 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2466 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2467 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2469 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2470 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2471 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2472 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2473 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2474 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2475 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2476 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2477 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2478 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2479 Heps = _mm256_mul_ps(vfeps,H);
2480 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2481 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2482 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2486 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2488 /* Calculate temporary vectorial force */
2489 tx = _mm256_mul_ps(fscal,dx11);
2490 ty = _mm256_mul_ps(fscal,dy11);
2491 tz = _mm256_mul_ps(fscal,dz11);
2493 /* Update vectorial force */
2494 fix1 = _mm256_add_ps(fix1,tx);
2495 fiy1 = _mm256_add_ps(fiy1,ty);
2496 fiz1 = _mm256_add_ps(fiz1,tz);
2498 fjx1 = _mm256_add_ps(fjx1,tx);
2499 fjy1 = _mm256_add_ps(fjy1,ty);
2500 fjz1 = _mm256_add_ps(fjz1,tz);
2502 /**************************
2503 * CALCULATE INTERACTIONS *
2504 **************************/
2506 r12 = _mm256_mul_ps(rsq12,rinv12);
2507 r12 = _mm256_andnot_ps(dummy_mask,r12);
2509 /* Calculate table index by multiplying r with table scale and truncate to integer */
2510 rt = _mm256_mul_ps(r12,vftabscale);
2511 vfitab = _mm256_cvttps_epi32(rt);
2512 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2513 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2514 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2515 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2516 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2517 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2519 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2520 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2521 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2522 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2523 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2524 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2525 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2526 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2527 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2528 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2529 Heps = _mm256_mul_ps(vfeps,H);
2530 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2531 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2532 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2536 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2538 /* Calculate temporary vectorial force */
2539 tx = _mm256_mul_ps(fscal,dx12);
2540 ty = _mm256_mul_ps(fscal,dy12);
2541 tz = _mm256_mul_ps(fscal,dz12);
2543 /* Update vectorial force */
2544 fix1 = _mm256_add_ps(fix1,tx);
2545 fiy1 = _mm256_add_ps(fiy1,ty);
2546 fiz1 = _mm256_add_ps(fiz1,tz);
2548 fjx2 = _mm256_add_ps(fjx2,tx);
2549 fjy2 = _mm256_add_ps(fjy2,ty);
2550 fjz2 = _mm256_add_ps(fjz2,tz);
2552 /**************************
2553 * CALCULATE INTERACTIONS *
2554 **************************/
2556 r20 = _mm256_mul_ps(rsq20,rinv20);
2557 r20 = _mm256_andnot_ps(dummy_mask,r20);
2559 /* Calculate table index by multiplying r with table scale and truncate to integer */
2560 rt = _mm256_mul_ps(r20,vftabscale);
2561 vfitab = _mm256_cvttps_epi32(rt);
2562 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2563 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2564 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2565 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2566 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2567 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2569 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2570 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2571 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2572 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2573 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2574 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2575 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2576 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2577 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2578 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2579 Heps = _mm256_mul_ps(vfeps,H);
2580 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2581 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2582 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2586 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2588 /* Calculate temporary vectorial force */
2589 tx = _mm256_mul_ps(fscal,dx20);
2590 ty = _mm256_mul_ps(fscal,dy20);
2591 tz = _mm256_mul_ps(fscal,dz20);
2593 /* Update vectorial force */
2594 fix2 = _mm256_add_ps(fix2,tx);
2595 fiy2 = _mm256_add_ps(fiy2,ty);
2596 fiz2 = _mm256_add_ps(fiz2,tz);
2598 fjx0 = _mm256_add_ps(fjx0,tx);
2599 fjy0 = _mm256_add_ps(fjy0,ty);
2600 fjz0 = _mm256_add_ps(fjz0,tz);
2602 /**************************
2603 * CALCULATE INTERACTIONS *
2604 **************************/
2606 r21 = _mm256_mul_ps(rsq21,rinv21);
2607 r21 = _mm256_andnot_ps(dummy_mask,r21);
2609 /* Calculate table index by multiplying r with table scale and truncate to integer */
2610 rt = _mm256_mul_ps(r21,vftabscale);
2611 vfitab = _mm256_cvttps_epi32(rt);
2612 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2613 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2614 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2615 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2616 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2617 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2619 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2620 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2621 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2622 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2623 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2624 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2625 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2626 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2627 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2628 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2629 Heps = _mm256_mul_ps(vfeps,H);
2630 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2631 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2632 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2636 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2638 /* Calculate temporary vectorial force */
2639 tx = _mm256_mul_ps(fscal,dx21);
2640 ty = _mm256_mul_ps(fscal,dy21);
2641 tz = _mm256_mul_ps(fscal,dz21);
2643 /* Update vectorial force */
2644 fix2 = _mm256_add_ps(fix2,tx);
2645 fiy2 = _mm256_add_ps(fiy2,ty);
2646 fiz2 = _mm256_add_ps(fiz2,tz);
2648 fjx1 = _mm256_add_ps(fjx1,tx);
2649 fjy1 = _mm256_add_ps(fjy1,ty);
2650 fjz1 = _mm256_add_ps(fjz1,tz);
2652 /**************************
2653 * CALCULATE INTERACTIONS *
2654 **************************/
2656 r22 = _mm256_mul_ps(rsq22,rinv22);
2657 r22 = _mm256_andnot_ps(dummy_mask,r22);
2659 /* Calculate table index by multiplying r with table scale and truncate to integer */
2660 rt = _mm256_mul_ps(r22,vftabscale);
2661 vfitab = _mm256_cvttps_epi32(rt);
2662 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2663 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2664 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2665 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2666 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2667 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2669 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2670 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2671 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2672 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2673 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2674 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2675 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2676 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2677 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2678 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2679 Heps = _mm256_mul_ps(vfeps,H);
2680 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2681 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2682 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2686 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2688 /* Calculate temporary vectorial force */
2689 tx = _mm256_mul_ps(fscal,dx22);
2690 ty = _mm256_mul_ps(fscal,dy22);
2691 tz = _mm256_mul_ps(fscal,dz22);
2693 /* Update vectorial force */
2694 fix2 = _mm256_add_ps(fix2,tx);
2695 fiy2 = _mm256_add_ps(fiy2,ty);
2696 fiz2 = _mm256_add_ps(fiz2,tz);
2698 fjx2 = _mm256_add_ps(fjx2,tx);
2699 fjy2 = _mm256_add_ps(fjy2,ty);
2700 fjz2 = _mm256_add_ps(fjz2,tz);
2702 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2703 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2704 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2705 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2706 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2707 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2708 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2709 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2711 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2712 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2714 /* Inner loop uses 368 flops */
2717 /* End of innermost loop */
2719 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2720 f+i_coord_offset,fshift+i_shift_offset);
2722 /* Increment number of inner iterations */
2723 inneriter += j_index_end - j_index_start;
2725 /* Outer loop uses 18 flops */
2728 /* Increment number of outer iterations */
2731 /* Update outer/inner flops */
2733 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);