2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
96 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
100 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
102 __m128i vfitab_lo,vfitab_hi;
103 __m128i ifour = _mm_set1_epi32(4);
104 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
106 __m256 dummy_mask,cutoff_mask;
107 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
108 __m256 one = _mm256_set1_ps(1.0);
109 __m256 two = _mm256_set1_ps(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm256_set1_ps(fr->epsfac);
122 charge = mdatoms->chargeA;
123 nvdwtype = fr->ntype;
125 vdwtype = mdatoms->typeA;
127 vftab = kernel_data->table_elec_vdw->data;
128 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
130 /* Setup water-specific parameters */
131 inr = nlist->iinr[0];
132 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
133 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
134 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
135 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
137 jq0 = _mm256_set1_ps(charge[inr+0]);
138 jq1 = _mm256_set1_ps(charge[inr+1]);
139 jq2 = _mm256_set1_ps(charge[inr+2]);
140 vdwjidx0A = 2*vdwtype[inr+0];
141 qq00 = _mm256_mul_ps(iq0,jq0);
142 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
143 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
144 qq01 = _mm256_mul_ps(iq0,jq1);
145 qq02 = _mm256_mul_ps(iq0,jq2);
146 qq10 = _mm256_mul_ps(iq1,jq0);
147 qq11 = _mm256_mul_ps(iq1,jq1);
148 qq12 = _mm256_mul_ps(iq1,jq2);
149 qq20 = _mm256_mul_ps(iq2,jq0);
150 qq21 = _mm256_mul_ps(iq2,jq1);
151 qq22 = _mm256_mul_ps(iq2,jq2);
153 /* Avoid stupid compiler warnings */
154 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
167 for(iidx=0;iidx<4*DIM;iidx++)
172 /* Start outer loop over neighborlists */
173 for(iidx=0; iidx<nri; iidx++)
175 /* Load shift vector for this list */
176 i_shift_offset = DIM*shiftidx[iidx];
178 /* Load limits for loop over neighbors */
179 j_index_start = jindex[iidx];
180 j_index_end = jindex[iidx+1];
182 /* Get outer coordinate index */
184 i_coord_offset = DIM*inr;
186 /* Load i particle coords and add shift vector */
187 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
188 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
190 fix0 = _mm256_setzero_ps();
191 fiy0 = _mm256_setzero_ps();
192 fiz0 = _mm256_setzero_ps();
193 fix1 = _mm256_setzero_ps();
194 fiy1 = _mm256_setzero_ps();
195 fiz1 = _mm256_setzero_ps();
196 fix2 = _mm256_setzero_ps();
197 fiy2 = _mm256_setzero_ps();
198 fiz2 = _mm256_setzero_ps();
200 /* Reset potential sums */
201 velecsum = _mm256_setzero_ps();
202 vvdwsum = _mm256_setzero_ps();
204 /* Start inner kernel loop */
205 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
208 /* Get j neighbor index, and coordinate index */
217 j_coord_offsetA = DIM*jnrA;
218 j_coord_offsetB = DIM*jnrB;
219 j_coord_offsetC = DIM*jnrC;
220 j_coord_offsetD = DIM*jnrD;
221 j_coord_offsetE = DIM*jnrE;
222 j_coord_offsetF = DIM*jnrF;
223 j_coord_offsetG = DIM*jnrG;
224 j_coord_offsetH = DIM*jnrH;
226 /* load j atom coordinates */
227 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
228 x+j_coord_offsetC,x+j_coord_offsetD,
229 x+j_coord_offsetE,x+j_coord_offsetF,
230 x+j_coord_offsetG,x+j_coord_offsetH,
231 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
233 /* Calculate displacement vector */
234 dx00 = _mm256_sub_ps(ix0,jx0);
235 dy00 = _mm256_sub_ps(iy0,jy0);
236 dz00 = _mm256_sub_ps(iz0,jz0);
237 dx01 = _mm256_sub_ps(ix0,jx1);
238 dy01 = _mm256_sub_ps(iy0,jy1);
239 dz01 = _mm256_sub_ps(iz0,jz1);
240 dx02 = _mm256_sub_ps(ix0,jx2);
241 dy02 = _mm256_sub_ps(iy0,jy2);
242 dz02 = _mm256_sub_ps(iz0,jz2);
243 dx10 = _mm256_sub_ps(ix1,jx0);
244 dy10 = _mm256_sub_ps(iy1,jy0);
245 dz10 = _mm256_sub_ps(iz1,jz0);
246 dx11 = _mm256_sub_ps(ix1,jx1);
247 dy11 = _mm256_sub_ps(iy1,jy1);
248 dz11 = _mm256_sub_ps(iz1,jz1);
249 dx12 = _mm256_sub_ps(ix1,jx2);
250 dy12 = _mm256_sub_ps(iy1,jy2);
251 dz12 = _mm256_sub_ps(iz1,jz2);
252 dx20 = _mm256_sub_ps(ix2,jx0);
253 dy20 = _mm256_sub_ps(iy2,jy0);
254 dz20 = _mm256_sub_ps(iz2,jz0);
255 dx21 = _mm256_sub_ps(ix2,jx1);
256 dy21 = _mm256_sub_ps(iy2,jy1);
257 dz21 = _mm256_sub_ps(iz2,jz1);
258 dx22 = _mm256_sub_ps(ix2,jx2);
259 dy22 = _mm256_sub_ps(iy2,jy2);
260 dz22 = _mm256_sub_ps(iz2,jz2);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
264 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
265 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
266 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
267 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
268 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
269 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
270 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
271 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
273 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
274 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
275 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
276 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
277 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
278 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
279 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
280 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
281 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
283 fjx0 = _mm256_setzero_ps();
284 fjy0 = _mm256_setzero_ps();
285 fjz0 = _mm256_setzero_ps();
286 fjx1 = _mm256_setzero_ps();
287 fjy1 = _mm256_setzero_ps();
288 fjz1 = _mm256_setzero_ps();
289 fjx2 = _mm256_setzero_ps();
290 fjy2 = _mm256_setzero_ps();
291 fjz2 = _mm256_setzero_ps();
293 /**************************
294 * CALCULATE INTERACTIONS *
295 **************************/
297 r00 = _mm256_mul_ps(rsq00,rinv00);
299 /* Calculate table index by multiplying r with table scale and truncate to integer */
300 rt = _mm256_mul_ps(r00,vftabscale);
301 vfitab = _mm256_cvttps_epi32(rt);
302 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
303 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
304 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
305 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
306 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
307 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
309 /* CUBIC SPLINE TABLE ELECTROSTATICS */
310 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
311 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
312 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
313 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
314 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
315 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
316 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
317 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
318 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
319 Heps = _mm256_mul_ps(vfeps,H);
320 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
321 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
322 velec = _mm256_mul_ps(qq00,VV);
323 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
324 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
326 /* CUBIC SPLINE TABLE DISPERSION */
327 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
328 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
329 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
330 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
331 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
332 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
333 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
334 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
335 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
336 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
337 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
338 Heps = _mm256_mul_ps(vfeps,H);
339 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
340 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
341 vvdw6 = _mm256_mul_ps(c6_00,VV);
342 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
343 fvdw6 = _mm256_mul_ps(c6_00,FF);
345 /* CUBIC SPLINE TABLE REPULSION */
346 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
347 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
348 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
349 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
350 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
351 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
352 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
353 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
354 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
355 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
356 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
357 Heps = _mm256_mul_ps(vfeps,H);
358 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
359 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
360 vvdw12 = _mm256_mul_ps(c12_00,VV);
361 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
362 fvdw12 = _mm256_mul_ps(c12_00,FF);
363 vvdw = _mm256_add_ps(vvdw12,vvdw6);
364 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
366 /* Update potential sum for this i atom from the interaction with this j atom. */
367 velecsum = _mm256_add_ps(velecsum,velec);
368 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
370 fscal = _mm256_add_ps(felec,fvdw);
372 /* Calculate temporary vectorial force */
373 tx = _mm256_mul_ps(fscal,dx00);
374 ty = _mm256_mul_ps(fscal,dy00);
375 tz = _mm256_mul_ps(fscal,dz00);
377 /* Update vectorial force */
378 fix0 = _mm256_add_ps(fix0,tx);
379 fiy0 = _mm256_add_ps(fiy0,ty);
380 fiz0 = _mm256_add_ps(fiz0,tz);
382 fjx0 = _mm256_add_ps(fjx0,tx);
383 fjy0 = _mm256_add_ps(fjy0,ty);
384 fjz0 = _mm256_add_ps(fjz0,tz);
386 /**************************
387 * CALCULATE INTERACTIONS *
388 **************************/
390 r01 = _mm256_mul_ps(rsq01,rinv01);
392 /* Calculate table index by multiplying r with table scale and truncate to integer */
393 rt = _mm256_mul_ps(r01,vftabscale);
394 vfitab = _mm256_cvttps_epi32(rt);
395 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
396 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
397 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
398 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
399 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
400 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
402 /* CUBIC SPLINE TABLE ELECTROSTATICS */
403 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
404 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
405 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
406 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
407 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
408 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
409 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
410 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
411 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
412 Heps = _mm256_mul_ps(vfeps,H);
413 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
414 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
415 velec = _mm256_mul_ps(qq01,VV);
416 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
417 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
419 /* Update potential sum for this i atom from the interaction with this j atom. */
420 velecsum = _mm256_add_ps(velecsum,velec);
424 /* Calculate temporary vectorial force */
425 tx = _mm256_mul_ps(fscal,dx01);
426 ty = _mm256_mul_ps(fscal,dy01);
427 tz = _mm256_mul_ps(fscal,dz01);
429 /* Update vectorial force */
430 fix0 = _mm256_add_ps(fix0,tx);
431 fiy0 = _mm256_add_ps(fiy0,ty);
432 fiz0 = _mm256_add_ps(fiz0,tz);
434 fjx1 = _mm256_add_ps(fjx1,tx);
435 fjy1 = _mm256_add_ps(fjy1,ty);
436 fjz1 = _mm256_add_ps(fjz1,tz);
438 /**************************
439 * CALCULATE INTERACTIONS *
440 **************************/
442 r02 = _mm256_mul_ps(rsq02,rinv02);
444 /* Calculate table index by multiplying r with table scale and truncate to integer */
445 rt = _mm256_mul_ps(r02,vftabscale);
446 vfitab = _mm256_cvttps_epi32(rt);
447 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
448 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
449 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
450 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
451 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
452 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
454 /* CUBIC SPLINE TABLE ELECTROSTATICS */
455 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
456 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
457 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
458 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
459 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
460 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
461 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
462 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
463 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
464 Heps = _mm256_mul_ps(vfeps,H);
465 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
466 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
467 velec = _mm256_mul_ps(qq02,VV);
468 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
469 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
471 /* Update potential sum for this i atom from the interaction with this j atom. */
472 velecsum = _mm256_add_ps(velecsum,velec);
476 /* Calculate temporary vectorial force */
477 tx = _mm256_mul_ps(fscal,dx02);
478 ty = _mm256_mul_ps(fscal,dy02);
479 tz = _mm256_mul_ps(fscal,dz02);
481 /* Update vectorial force */
482 fix0 = _mm256_add_ps(fix0,tx);
483 fiy0 = _mm256_add_ps(fiy0,ty);
484 fiz0 = _mm256_add_ps(fiz0,tz);
486 fjx2 = _mm256_add_ps(fjx2,tx);
487 fjy2 = _mm256_add_ps(fjy2,ty);
488 fjz2 = _mm256_add_ps(fjz2,tz);
490 /**************************
491 * CALCULATE INTERACTIONS *
492 **************************/
494 r10 = _mm256_mul_ps(rsq10,rinv10);
496 /* Calculate table index by multiplying r with table scale and truncate to integer */
497 rt = _mm256_mul_ps(r10,vftabscale);
498 vfitab = _mm256_cvttps_epi32(rt);
499 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
500 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
501 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
502 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
503 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
504 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
506 /* CUBIC SPLINE TABLE ELECTROSTATICS */
507 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
508 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
509 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
510 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
511 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
512 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
513 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
514 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
515 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
516 Heps = _mm256_mul_ps(vfeps,H);
517 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
518 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
519 velec = _mm256_mul_ps(qq10,VV);
520 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
521 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
523 /* Update potential sum for this i atom from the interaction with this j atom. */
524 velecsum = _mm256_add_ps(velecsum,velec);
528 /* Calculate temporary vectorial force */
529 tx = _mm256_mul_ps(fscal,dx10);
530 ty = _mm256_mul_ps(fscal,dy10);
531 tz = _mm256_mul_ps(fscal,dz10);
533 /* Update vectorial force */
534 fix1 = _mm256_add_ps(fix1,tx);
535 fiy1 = _mm256_add_ps(fiy1,ty);
536 fiz1 = _mm256_add_ps(fiz1,tz);
538 fjx0 = _mm256_add_ps(fjx0,tx);
539 fjy0 = _mm256_add_ps(fjy0,ty);
540 fjz0 = _mm256_add_ps(fjz0,tz);
542 /**************************
543 * CALCULATE INTERACTIONS *
544 **************************/
546 r11 = _mm256_mul_ps(rsq11,rinv11);
548 /* Calculate table index by multiplying r with table scale and truncate to integer */
549 rt = _mm256_mul_ps(r11,vftabscale);
550 vfitab = _mm256_cvttps_epi32(rt);
551 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
552 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
553 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
554 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
555 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
556 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
558 /* CUBIC SPLINE TABLE ELECTROSTATICS */
559 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
560 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
561 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
562 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
563 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
564 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
565 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
566 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
567 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
568 Heps = _mm256_mul_ps(vfeps,H);
569 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
570 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
571 velec = _mm256_mul_ps(qq11,VV);
572 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
573 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
575 /* Update potential sum for this i atom from the interaction with this j atom. */
576 velecsum = _mm256_add_ps(velecsum,velec);
580 /* Calculate temporary vectorial force */
581 tx = _mm256_mul_ps(fscal,dx11);
582 ty = _mm256_mul_ps(fscal,dy11);
583 tz = _mm256_mul_ps(fscal,dz11);
585 /* Update vectorial force */
586 fix1 = _mm256_add_ps(fix1,tx);
587 fiy1 = _mm256_add_ps(fiy1,ty);
588 fiz1 = _mm256_add_ps(fiz1,tz);
590 fjx1 = _mm256_add_ps(fjx1,tx);
591 fjy1 = _mm256_add_ps(fjy1,ty);
592 fjz1 = _mm256_add_ps(fjz1,tz);
594 /**************************
595 * CALCULATE INTERACTIONS *
596 **************************/
598 r12 = _mm256_mul_ps(rsq12,rinv12);
600 /* Calculate table index by multiplying r with table scale and truncate to integer */
601 rt = _mm256_mul_ps(r12,vftabscale);
602 vfitab = _mm256_cvttps_epi32(rt);
603 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
604 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
605 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
606 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
607 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
608 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
610 /* CUBIC SPLINE TABLE ELECTROSTATICS */
611 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
612 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
613 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
614 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
615 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
616 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
617 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
618 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
619 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
620 Heps = _mm256_mul_ps(vfeps,H);
621 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
622 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
623 velec = _mm256_mul_ps(qq12,VV);
624 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
625 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
627 /* Update potential sum for this i atom from the interaction with this j atom. */
628 velecsum = _mm256_add_ps(velecsum,velec);
632 /* Calculate temporary vectorial force */
633 tx = _mm256_mul_ps(fscal,dx12);
634 ty = _mm256_mul_ps(fscal,dy12);
635 tz = _mm256_mul_ps(fscal,dz12);
637 /* Update vectorial force */
638 fix1 = _mm256_add_ps(fix1,tx);
639 fiy1 = _mm256_add_ps(fiy1,ty);
640 fiz1 = _mm256_add_ps(fiz1,tz);
642 fjx2 = _mm256_add_ps(fjx2,tx);
643 fjy2 = _mm256_add_ps(fjy2,ty);
644 fjz2 = _mm256_add_ps(fjz2,tz);
646 /**************************
647 * CALCULATE INTERACTIONS *
648 **************************/
650 r20 = _mm256_mul_ps(rsq20,rinv20);
652 /* Calculate table index by multiplying r with table scale and truncate to integer */
653 rt = _mm256_mul_ps(r20,vftabscale);
654 vfitab = _mm256_cvttps_epi32(rt);
655 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
656 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
657 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
658 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
659 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
660 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
662 /* CUBIC SPLINE TABLE ELECTROSTATICS */
663 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
664 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
665 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
666 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
667 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
668 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
669 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
670 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
671 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
672 Heps = _mm256_mul_ps(vfeps,H);
673 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
674 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
675 velec = _mm256_mul_ps(qq20,VV);
676 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
677 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
679 /* Update potential sum for this i atom from the interaction with this j atom. */
680 velecsum = _mm256_add_ps(velecsum,velec);
684 /* Calculate temporary vectorial force */
685 tx = _mm256_mul_ps(fscal,dx20);
686 ty = _mm256_mul_ps(fscal,dy20);
687 tz = _mm256_mul_ps(fscal,dz20);
689 /* Update vectorial force */
690 fix2 = _mm256_add_ps(fix2,tx);
691 fiy2 = _mm256_add_ps(fiy2,ty);
692 fiz2 = _mm256_add_ps(fiz2,tz);
694 fjx0 = _mm256_add_ps(fjx0,tx);
695 fjy0 = _mm256_add_ps(fjy0,ty);
696 fjz0 = _mm256_add_ps(fjz0,tz);
698 /**************************
699 * CALCULATE INTERACTIONS *
700 **************************/
702 r21 = _mm256_mul_ps(rsq21,rinv21);
704 /* Calculate table index by multiplying r with table scale and truncate to integer */
705 rt = _mm256_mul_ps(r21,vftabscale);
706 vfitab = _mm256_cvttps_epi32(rt);
707 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
708 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
709 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
710 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
711 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
712 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
714 /* CUBIC SPLINE TABLE ELECTROSTATICS */
715 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
716 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
717 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
718 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
719 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
720 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
721 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
722 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
723 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
724 Heps = _mm256_mul_ps(vfeps,H);
725 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
726 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
727 velec = _mm256_mul_ps(qq21,VV);
728 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
729 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
731 /* Update potential sum for this i atom from the interaction with this j atom. */
732 velecsum = _mm256_add_ps(velecsum,velec);
736 /* Calculate temporary vectorial force */
737 tx = _mm256_mul_ps(fscal,dx21);
738 ty = _mm256_mul_ps(fscal,dy21);
739 tz = _mm256_mul_ps(fscal,dz21);
741 /* Update vectorial force */
742 fix2 = _mm256_add_ps(fix2,tx);
743 fiy2 = _mm256_add_ps(fiy2,ty);
744 fiz2 = _mm256_add_ps(fiz2,tz);
746 fjx1 = _mm256_add_ps(fjx1,tx);
747 fjy1 = _mm256_add_ps(fjy1,ty);
748 fjz1 = _mm256_add_ps(fjz1,tz);
750 /**************************
751 * CALCULATE INTERACTIONS *
752 **************************/
754 r22 = _mm256_mul_ps(rsq22,rinv22);
756 /* Calculate table index by multiplying r with table scale and truncate to integer */
757 rt = _mm256_mul_ps(r22,vftabscale);
758 vfitab = _mm256_cvttps_epi32(rt);
759 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
760 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
761 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
762 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
763 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
764 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
766 /* CUBIC SPLINE TABLE ELECTROSTATICS */
767 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
768 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
769 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
770 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
771 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
772 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
773 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
774 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
775 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
776 Heps = _mm256_mul_ps(vfeps,H);
777 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
778 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
779 velec = _mm256_mul_ps(qq22,VV);
780 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
781 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
783 /* Update potential sum for this i atom from the interaction with this j atom. */
784 velecsum = _mm256_add_ps(velecsum,velec);
788 /* Calculate temporary vectorial force */
789 tx = _mm256_mul_ps(fscal,dx22);
790 ty = _mm256_mul_ps(fscal,dy22);
791 tz = _mm256_mul_ps(fscal,dz22);
793 /* Update vectorial force */
794 fix2 = _mm256_add_ps(fix2,tx);
795 fiy2 = _mm256_add_ps(fiy2,ty);
796 fiz2 = _mm256_add_ps(fiz2,tz);
798 fjx2 = _mm256_add_ps(fjx2,tx);
799 fjy2 = _mm256_add_ps(fjy2,ty);
800 fjz2 = _mm256_add_ps(fjz2,tz);
802 fjptrA = f+j_coord_offsetA;
803 fjptrB = f+j_coord_offsetB;
804 fjptrC = f+j_coord_offsetC;
805 fjptrD = f+j_coord_offsetD;
806 fjptrE = f+j_coord_offsetE;
807 fjptrF = f+j_coord_offsetF;
808 fjptrG = f+j_coord_offsetG;
809 fjptrH = f+j_coord_offsetH;
811 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
812 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
814 /* Inner loop uses 417 flops */
820 /* Get j neighbor index, and coordinate index */
821 jnrlistA = jjnr[jidx];
822 jnrlistB = jjnr[jidx+1];
823 jnrlistC = jjnr[jidx+2];
824 jnrlistD = jjnr[jidx+3];
825 jnrlistE = jjnr[jidx+4];
826 jnrlistF = jjnr[jidx+5];
827 jnrlistG = jjnr[jidx+6];
828 jnrlistH = jjnr[jidx+7];
829 /* Sign of each element will be negative for non-real atoms.
830 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
831 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
833 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
834 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
836 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
837 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
838 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
839 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
840 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
841 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
842 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
843 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
844 j_coord_offsetA = DIM*jnrA;
845 j_coord_offsetB = DIM*jnrB;
846 j_coord_offsetC = DIM*jnrC;
847 j_coord_offsetD = DIM*jnrD;
848 j_coord_offsetE = DIM*jnrE;
849 j_coord_offsetF = DIM*jnrF;
850 j_coord_offsetG = DIM*jnrG;
851 j_coord_offsetH = DIM*jnrH;
853 /* load j atom coordinates */
854 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
855 x+j_coord_offsetC,x+j_coord_offsetD,
856 x+j_coord_offsetE,x+j_coord_offsetF,
857 x+j_coord_offsetG,x+j_coord_offsetH,
858 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
860 /* Calculate displacement vector */
861 dx00 = _mm256_sub_ps(ix0,jx0);
862 dy00 = _mm256_sub_ps(iy0,jy0);
863 dz00 = _mm256_sub_ps(iz0,jz0);
864 dx01 = _mm256_sub_ps(ix0,jx1);
865 dy01 = _mm256_sub_ps(iy0,jy1);
866 dz01 = _mm256_sub_ps(iz0,jz1);
867 dx02 = _mm256_sub_ps(ix0,jx2);
868 dy02 = _mm256_sub_ps(iy0,jy2);
869 dz02 = _mm256_sub_ps(iz0,jz2);
870 dx10 = _mm256_sub_ps(ix1,jx0);
871 dy10 = _mm256_sub_ps(iy1,jy0);
872 dz10 = _mm256_sub_ps(iz1,jz0);
873 dx11 = _mm256_sub_ps(ix1,jx1);
874 dy11 = _mm256_sub_ps(iy1,jy1);
875 dz11 = _mm256_sub_ps(iz1,jz1);
876 dx12 = _mm256_sub_ps(ix1,jx2);
877 dy12 = _mm256_sub_ps(iy1,jy2);
878 dz12 = _mm256_sub_ps(iz1,jz2);
879 dx20 = _mm256_sub_ps(ix2,jx0);
880 dy20 = _mm256_sub_ps(iy2,jy0);
881 dz20 = _mm256_sub_ps(iz2,jz0);
882 dx21 = _mm256_sub_ps(ix2,jx1);
883 dy21 = _mm256_sub_ps(iy2,jy1);
884 dz21 = _mm256_sub_ps(iz2,jz1);
885 dx22 = _mm256_sub_ps(ix2,jx2);
886 dy22 = _mm256_sub_ps(iy2,jy2);
887 dz22 = _mm256_sub_ps(iz2,jz2);
889 /* Calculate squared distance and things based on it */
890 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
891 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
892 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
893 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
894 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
895 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
896 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
897 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
898 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
900 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
901 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
902 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
903 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
904 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
905 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
906 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
907 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
908 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
910 fjx0 = _mm256_setzero_ps();
911 fjy0 = _mm256_setzero_ps();
912 fjz0 = _mm256_setzero_ps();
913 fjx1 = _mm256_setzero_ps();
914 fjy1 = _mm256_setzero_ps();
915 fjz1 = _mm256_setzero_ps();
916 fjx2 = _mm256_setzero_ps();
917 fjy2 = _mm256_setzero_ps();
918 fjz2 = _mm256_setzero_ps();
920 /**************************
921 * CALCULATE INTERACTIONS *
922 **************************/
924 r00 = _mm256_mul_ps(rsq00,rinv00);
925 r00 = _mm256_andnot_ps(dummy_mask,r00);
927 /* Calculate table index by multiplying r with table scale and truncate to integer */
928 rt = _mm256_mul_ps(r00,vftabscale);
929 vfitab = _mm256_cvttps_epi32(rt);
930 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
931 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
932 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
933 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
934 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
935 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
937 /* CUBIC SPLINE TABLE ELECTROSTATICS */
938 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
939 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
940 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
941 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
942 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
943 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
944 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
945 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
946 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
947 Heps = _mm256_mul_ps(vfeps,H);
948 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
949 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
950 velec = _mm256_mul_ps(qq00,VV);
951 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
952 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
954 /* CUBIC SPLINE TABLE DISPERSION */
955 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
956 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
957 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
958 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
959 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
960 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
961 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
962 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
963 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
964 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
965 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
966 Heps = _mm256_mul_ps(vfeps,H);
967 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
968 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
969 vvdw6 = _mm256_mul_ps(c6_00,VV);
970 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
971 fvdw6 = _mm256_mul_ps(c6_00,FF);
973 /* CUBIC SPLINE TABLE REPULSION */
974 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
975 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
976 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
977 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
978 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
979 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
980 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
981 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
982 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
983 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
984 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
985 Heps = _mm256_mul_ps(vfeps,H);
986 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
987 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
988 vvdw12 = _mm256_mul_ps(c12_00,VV);
989 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
990 fvdw12 = _mm256_mul_ps(c12_00,FF);
991 vvdw = _mm256_add_ps(vvdw12,vvdw6);
992 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
994 /* Update potential sum for this i atom from the interaction with this j atom. */
995 velec = _mm256_andnot_ps(dummy_mask,velec);
996 velecsum = _mm256_add_ps(velecsum,velec);
997 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
998 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
1000 fscal = _mm256_add_ps(felec,fvdw);
1002 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1004 /* Calculate temporary vectorial force */
1005 tx = _mm256_mul_ps(fscal,dx00);
1006 ty = _mm256_mul_ps(fscal,dy00);
1007 tz = _mm256_mul_ps(fscal,dz00);
1009 /* Update vectorial force */
1010 fix0 = _mm256_add_ps(fix0,tx);
1011 fiy0 = _mm256_add_ps(fiy0,ty);
1012 fiz0 = _mm256_add_ps(fiz0,tz);
1014 fjx0 = _mm256_add_ps(fjx0,tx);
1015 fjy0 = _mm256_add_ps(fjy0,ty);
1016 fjz0 = _mm256_add_ps(fjz0,tz);
1018 /**************************
1019 * CALCULATE INTERACTIONS *
1020 **************************/
1022 r01 = _mm256_mul_ps(rsq01,rinv01);
1023 r01 = _mm256_andnot_ps(dummy_mask,r01);
1025 /* Calculate table index by multiplying r with table scale and truncate to integer */
1026 rt = _mm256_mul_ps(r01,vftabscale);
1027 vfitab = _mm256_cvttps_epi32(rt);
1028 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1029 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1030 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1031 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1032 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1033 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1035 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1036 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1037 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1038 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1039 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1040 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1041 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1042 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1043 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1044 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1045 Heps = _mm256_mul_ps(vfeps,H);
1046 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1047 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1048 velec = _mm256_mul_ps(qq01,VV);
1049 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1050 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1052 /* Update potential sum for this i atom from the interaction with this j atom. */
1053 velec = _mm256_andnot_ps(dummy_mask,velec);
1054 velecsum = _mm256_add_ps(velecsum,velec);
1058 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1060 /* Calculate temporary vectorial force */
1061 tx = _mm256_mul_ps(fscal,dx01);
1062 ty = _mm256_mul_ps(fscal,dy01);
1063 tz = _mm256_mul_ps(fscal,dz01);
1065 /* Update vectorial force */
1066 fix0 = _mm256_add_ps(fix0,tx);
1067 fiy0 = _mm256_add_ps(fiy0,ty);
1068 fiz0 = _mm256_add_ps(fiz0,tz);
1070 fjx1 = _mm256_add_ps(fjx1,tx);
1071 fjy1 = _mm256_add_ps(fjy1,ty);
1072 fjz1 = _mm256_add_ps(fjz1,tz);
1074 /**************************
1075 * CALCULATE INTERACTIONS *
1076 **************************/
1078 r02 = _mm256_mul_ps(rsq02,rinv02);
1079 r02 = _mm256_andnot_ps(dummy_mask,r02);
1081 /* Calculate table index by multiplying r with table scale and truncate to integer */
1082 rt = _mm256_mul_ps(r02,vftabscale);
1083 vfitab = _mm256_cvttps_epi32(rt);
1084 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1085 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1086 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1087 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1088 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1089 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1091 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1092 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1093 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1094 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1095 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1096 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1097 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1098 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1099 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1100 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1101 Heps = _mm256_mul_ps(vfeps,H);
1102 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1103 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1104 velec = _mm256_mul_ps(qq02,VV);
1105 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1106 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1108 /* Update potential sum for this i atom from the interaction with this j atom. */
1109 velec = _mm256_andnot_ps(dummy_mask,velec);
1110 velecsum = _mm256_add_ps(velecsum,velec);
1114 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1116 /* Calculate temporary vectorial force */
1117 tx = _mm256_mul_ps(fscal,dx02);
1118 ty = _mm256_mul_ps(fscal,dy02);
1119 tz = _mm256_mul_ps(fscal,dz02);
1121 /* Update vectorial force */
1122 fix0 = _mm256_add_ps(fix0,tx);
1123 fiy0 = _mm256_add_ps(fiy0,ty);
1124 fiz0 = _mm256_add_ps(fiz0,tz);
1126 fjx2 = _mm256_add_ps(fjx2,tx);
1127 fjy2 = _mm256_add_ps(fjy2,ty);
1128 fjz2 = _mm256_add_ps(fjz2,tz);
1130 /**************************
1131 * CALCULATE INTERACTIONS *
1132 **************************/
1134 r10 = _mm256_mul_ps(rsq10,rinv10);
1135 r10 = _mm256_andnot_ps(dummy_mask,r10);
1137 /* Calculate table index by multiplying r with table scale and truncate to integer */
1138 rt = _mm256_mul_ps(r10,vftabscale);
1139 vfitab = _mm256_cvttps_epi32(rt);
1140 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1141 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1142 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1143 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1144 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1145 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1147 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1148 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1149 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1150 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1151 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1152 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1153 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1154 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1155 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1156 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1157 Heps = _mm256_mul_ps(vfeps,H);
1158 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1159 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1160 velec = _mm256_mul_ps(qq10,VV);
1161 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1162 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1164 /* Update potential sum for this i atom from the interaction with this j atom. */
1165 velec = _mm256_andnot_ps(dummy_mask,velec);
1166 velecsum = _mm256_add_ps(velecsum,velec);
1170 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1172 /* Calculate temporary vectorial force */
1173 tx = _mm256_mul_ps(fscal,dx10);
1174 ty = _mm256_mul_ps(fscal,dy10);
1175 tz = _mm256_mul_ps(fscal,dz10);
1177 /* Update vectorial force */
1178 fix1 = _mm256_add_ps(fix1,tx);
1179 fiy1 = _mm256_add_ps(fiy1,ty);
1180 fiz1 = _mm256_add_ps(fiz1,tz);
1182 fjx0 = _mm256_add_ps(fjx0,tx);
1183 fjy0 = _mm256_add_ps(fjy0,ty);
1184 fjz0 = _mm256_add_ps(fjz0,tz);
1186 /**************************
1187 * CALCULATE INTERACTIONS *
1188 **************************/
1190 r11 = _mm256_mul_ps(rsq11,rinv11);
1191 r11 = _mm256_andnot_ps(dummy_mask,r11);
1193 /* Calculate table index by multiplying r with table scale and truncate to integer */
1194 rt = _mm256_mul_ps(r11,vftabscale);
1195 vfitab = _mm256_cvttps_epi32(rt);
1196 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1197 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1198 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1199 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1200 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1201 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1203 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1204 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1205 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1206 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1207 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1208 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1209 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1210 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1211 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1212 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1213 Heps = _mm256_mul_ps(vfeps,H);
1214 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1215 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1216 velec = _mm256_mul_ps(qq11,VV);
1217 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1218 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1220 /* Update potential sum for this i atom from the interaction with this j atom. */
1221 velec = _mm256_andnot_ps(dummy_mask,velec);
1222 velecsum = _mm256_add_ps(velecsum,velec);
1226 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1228 /* Calculate temporary vectorial force */
1229 tx = _mm256_mul_ps(fscal,dx11);
1230 ty = _mm256_mul_ps(fscal,dy11);
1231 tz = _mm256_mul_ps(fscal,dz11);
1233 /* Update vectorial force */
1234 fix1 = _mm256_add_ps(fix1,tx);
1235 fiy1 = _mm256_add_ps(fiy1,ty);
1236 fiz1 = _mm256_add_ps(fiz1,tz);
1238 fjx1 = _mm256_add_ps(fjx1,tx);
1239 fjy1 = _mm256_add_ps(fjy1,ty);
1240 fjz1 = _mm256_add_ps(fjz1,tz);
1242 /**************************
1243 * CALCULATE INTERACTIONS *
1244 **************************/
1246 r12 = _mm256_mul_ps(rsq12,rinv12);
1247 r12 = _mm256_andnot_ps(dummy_mask,r12);
1249 /* Calculate table index by multiplying r with table scale and truncate to integer */
1250 rt = _mm256_mul_ps(r12,vftabscale);
1251 vfitab = _mm256_cvttps_epi32(rt);
1252 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1253 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1254 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1255 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1256 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1257 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1259 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1260 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1261 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1262 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1263 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1264 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1265 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1266 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1267 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1268 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1269 Heps = _mm256_mul_ps(vfeps,H);
1270 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1271 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1272 velec = _mm256_mul_ps(qq12,VV);
1273 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1274 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1276 /* Update potential sum for this i atom from the interaction with this j atom. */
1277 velec = _mm256_andnot_ps(dummy_mask,velec);
1278 velecsum = _mm256_add_ps(velecsum,velec);
1282 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1284 /* Calculate temporary vectorial force */
1285 tx = _mm256_mul_ps(fscal,dx12);
1286 ty = _mm256_mul_ps(fscal,dy12);
1287 tz = _mm256_mul_ps(fscal,dz12);
1289 /* Update vectorial force */
1290 fix1 = _mm256_add_ps(fix1,tx);
1291 fiy1 = _mm256_add_ps(fiy1,ty);
1292 fiz1 = _mm256_add_ps(fiz1,tz);
1294 fjx2 = _mm256_add_ps(fjx2,tx);
1295 fjy2 = _mm256_add_ps(fjy2,ty);
1296 fjz2 = _mm256_add_ps(fjz2,tz);
1298 /**************************
1299 * CALCULATE INTERACTIONS *
1300 **************************/
1302 r20 = _mm256_mul_ps(rsq20,rinv20);
1303 r20 = _mm256_andnot_ps(dummy_mask,r20);
1305 /* Calculate table index by multiplying r with table scale and truncate to integer */
1306 rt = _mm256_mul_ps(r20,vftabscale);
1307 vfitab = _mm256_cvttps_epi32(rt);
1308 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1309 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1310 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1311 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1312 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1313 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1315 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1316 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1317 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1318 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1319 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1320 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1321 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1322 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1323 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1324 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1325 Heps = _mm256_mul_ps(vfeps,H);
1326 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1327 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1328 velec = _mm256_mul_ps(qq20,VV);
1329 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1330 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1332 /* Update potential sum for this i atom from the interaction with this j atom. */
1333 velec = _mm256_andnot_ps(dummy_mask,velec);
1334 velecsum = _mm256_add_ps(velecsum,velec);
1338 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1340 /* Calculate temporary vectorial force */
1341 tx = _mm256_mul_ps(fscal,dx20);
1342 ty = _mm256_mul_ps(fscal,dy20);
1343 tz = _mm256_mul_ps(fscal,dz20);
1345 /* Update vectorial force */
1346 fix2 = _mm256_add_ps(fix2,tx);
1347 fiy2 = _mm256_add_ps(fiy2,ty);
1348 fiz2 = _mm256_add_ps(fiz2,tz);
1350 fjx0 = _mm256_add_ps(fjx0,tx);
1351 fjy0 = _mm256_add_ps(fjy0,ty);
1352 fjz0 = _mm256_add_ps(fjz0,tz);
1354 /**************************
1355 * CALCULATE INTERACTIONS *
1356 **************************/
1358 r21 = _mm256_mul_ps(rsq21,rinv21);
1359 r21 = _mm256_andnot_ps(dummy_mask,r21);
1361 /* Calculate table index by multiplying r with table scale and truncate to integer */
1362 rt = _mm256_mul_ps(r21,vftabscale);
1363 vfitab = _mm256_cvttps_epi32(rt);
1364 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1365 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1366 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1367 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1368 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1369 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1371 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1372 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1373 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1374 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1375 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1376 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1377 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1378 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1379 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1380 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1381 Heps = _mm256_mul_ps(vfeps,H);
1382 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1383 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1384 velec = _mm256_mul_ps(qq21,VV);
1385 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1386 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1388 /* Update potential sum for this i atom from the interaction with this j atom. */
1389 velec = _mm256_andnot_ps(dummy_mask,velec);
1390 velecsum = _mm256_add_ps(velecsum,velec);
1394 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1396 /* Calculate temporary vectorial force */
1397 tx = _mm256_mul_ps(fscal,dx21);
1398 ty = _mm256_mul_ps(fscal,dy21);
1399 tz = _mm256_mul_ps(fscal,dz21);
1401 /* Update vectorial force */
1402 fix2 = _mm256_add_ps(fix2,tx);
1403 fiy2 = _mm256_add_ps(fiy2,ty);
1404 fiz2 = _mm256_add_ps(fiz2,tz);
1406 fjx1 = _mm256_add_ps(fjx1,tx);
1407 fjy1 = _mm256_add_ps(fjy1,ty);
1408 fjz1 = _mm256_add_ps(fjz1,tz);
1410 /**************************
1411 * CALCULATE INTERACTIONS *
1412 **************************/
1414 r22 = _mm256_mul_ps(rsq22,rinv22);
1415 r22 = _mm256_andnot_ps(dummy_mask,r22);
1417 /* Calculate table index by multiplying r with table scale and truncate to integer */
1418 rt = _mm256_mul_ps(r22,vftabscale);
1419 vfitab = _mm256_cvttps_epi32(rt);
1420 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1421 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1422 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1423 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1424 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1425 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1427 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1428 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1429 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1430 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1431 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1432 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1433 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1434 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1435 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1436 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1437 Heps = _mm256_mul_ps(vfeps,H);
1438 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1439 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1440 velec = _mm256_mul_ps(qq22,VV);
1441 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1442 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1444 /* Update potential sum for this i atom from the interaction with this j atom. */
1445 velec = _mm256_andnot_ps(dummy_mask,velec);
1446 velecsum = _mm256_add_ps(velecsum,velec);
1450 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1452 /* Calculate temporary vectorial force */
1453 tx = _mm256_mul_ps(fscal,dx22);
1454 ty = _mm256_mul_ps(fscal,dy22);
1455 tz = _mm256_mul_ps(fscal,dz22);
1457 /* Update vectorial force */
1458 fix2 = _mm256_add_ps(fix2,tx);
1459 fiy2 = _mm256_add_ps(fiy2,ty);
1460 fiz2 = _mm256_add_ps(fiz2,tz);
1462 fjx2 = _mm256_add_ps(fjx2,tx);
1463 fjy2 = _mm256_add_ps(fjy2,ty);
1464 fjz2 = _mm256_add_ps(fjz2,tz);
1466 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1467 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1468 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1469 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1470 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1471 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1472 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1473 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1475 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1476 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1478 /* Inner loop uses 426 flops */
1481 /* End of innermost loop */
1483 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1484 f+i_coord_offset,fshift+i_shift_offset);
1487 /* Update potential energies */
1488 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1489 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1491 /* Increment number of inner iterations */
1492 inneriter += j_index_end - j_index_start;
1494 /* Outer loop uses 20 flops */
1497 /* Increment number of outer iterations */
1500 /* Update outer/inner flops */
1502 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
1505 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1506 * Electrostatics interaction: CubicSplineTable
1507 * VdW interaction: CubicSplineTable
1508 * Geometry: Water3-Water3
1509 * Calculate force/pot: Force
1512 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1513 (t_nblist * gmx_restrict nlist,
1514 rvec * gmx_restrict xx,
1515 rvec * gmx_restrict ff,
1516 t_forcerec * gmx_restrict fr,
1517 t_mdatoms * gmx_restrict mdatoms,
1518 nb_kernel_data_t * gmx_restrict kernel_data,
1519 t_nrnb * gmx_restrict nrnb)
1521 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1522 * just 0 for non-waters.
1523 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1524 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1526 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1527 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1528 int jnrA,jnrB,jnrC,jnrD;
1529 int jnrE,jnrF,jnrG,jnrH;
1530 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1531 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1532 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1533 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1534 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1535 real rcutoff_scalar;
1536 real *shiftvec,*fshift,*x,*f;
1537 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1538 real scratch[4*DIM];
1539 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1540 real * vdwioffsetptr0;
1541 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1542 real * vdwioffsetptr1;
1543 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1544 real * vdwioffsetptr2;
1545 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1546 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1547 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1548 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1549 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1550 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1551 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1552 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1553 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1554 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1555 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1556 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1557 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1558 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1559 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1560 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1561 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1564 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1567 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1568 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1570 __m128i vfitab_lo,vfitab_hi;
1571 __m128i ifour = _mm_set1_epi32(4);
1572 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1574 __m256 dummy_mask,cutoff_mask;
1575 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1576 __m256 one = _mm256_set1_ps(1.0);
1577 __m256 two = _mm256_set1_ps(2.0);
1583 jindex = nlist->jindex;
1585 shiftidx = nlist->shift;
1587 shiftvec = fr->shift_vec[0];
1588 fshift = fr->fshift[0];
1589 facel = _mm256_set1_ps(fr->epsfac);
1590 charge = mdatoms->chargeA;
1591 nvdwtype = fr->ntype;
1592 vdwparam = fr->nbfp;
1593 vdwtype = mdatoms->typeA;
1595 vftab = kernel_data->table_elec_vdw->data;
1596 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
1598 /* Setup water-specific parameters */
1599 inr = nlist->iinr[0];
1600 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1601 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1602 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1603 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1605 jq0 = _mm256_set1_ps(charge[inr+0]);
1606 jq1 = _mm256_set1_ps(charge[inr+1]);
1607 jq2 = _mm256_set1_ps(charge[inr+2]);
1608 vdwjidx0A = 2*vdwtype[inr+0];
1609 qq00 = _mm256_mul_ps(iq0,jq0);
1610 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1611 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1612 qq01 = _mm256_mul_ps(iq0,jq1);
1613 qq02 = _mm256_mul_ps(iq0,jq2);
1614 qq10 = _mm256_mul_ps(iq1,jq0);
1615 qq11 = _mm256_mul_ps(iq1,jq1);
1616 qq12 = _mm256_mul_ps(iq1,jq2);
1617 qq20 = _mm256_mul_ps(iq2,jq0);
1618 qq21 = _mm256_mul_ps(iq2,jq1);
1619 qq22 = _mm256_mul_ps(iq2,jq2);
1621 /* Avoid stupid compiler warnings */
1622 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1623 j_coord_offsetA = 0;
1624 j_coord_offsetB = 0;
1625 j_coord_offsetC = 0;
1626 j_coord_offsetD = 0;
1627 j_coord_offsetE = 0;
1628 j_coord_offsetF = 0;
1629 j_coord_offsetG = 0;
1630 j_coord_offsetH = 0;
1635 for(iidx=0;iidx<4*DIM;iidx++)
1637 scratch[iidx] = 0.0;
1640 /* Start outer loop over neighborlists */
1641 for(iidx=0; iidx<nri; iidx++)
1643 /* Load shift vector for this list */
1644 i_shift_offset = DIM*shiftidx[iidx];
1646 /* Load limits for loop over neighbors */
1647 j_index_start = jindex[iidx];
1648 j_index_end = jindex[iidx+1];
1650 /* Get outer coordinate index */
1652 i_coord_offset = DIM*inr;
1654 /* Load i particle coords and add shift vector */
1655 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1656 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1658 fix0 = _mm256_setzero_ps();
1659 fiy0 = _mm256_setzero_ps();
1660 fiz0 = _mm256_setzero_ps();
1661 fix1 = _mm256_setzero_ps();
1662 fiy1 = _mm256_setzero_ps();
1663 fiz1 = _mm256_setzero_ps();
1664 fix2 = _mm256_setzero_ps();
1665 fiy2 = _mm256_setzero_ps();
1666 fiz2 = _mm256_setzero_ps();
1668 /* Start inner kernel loop */
1669 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1672 /* Get j neighbor index, and coordinate index */
1674 jnrB = jjnr[jidx+1];
1675 jnrC = jjnr[jidx+2];
1676 jnrD = jjnr[jidx+3];
1677 jnrE = jjnr[jidx+4];
1678 jnrF = jjnr[jidx+5];
1679 jnrG = jjnr[jidx+6];
1680 jnrH = jjnr[jidx+7];
1681 j_coord_offsetA = DIM*jnrA;
1682 j_coord_offsetB = DIM*jnrB;
1683 j_coord_offsetC = DIM*jnrC;
1684 j_coord_offsetD = DIM*jnrD;
1685 j_coord_offsetE = DIM*jnrE;
1686 j_coord_offsetF = DIM*jnrF;
1687 j_coord_offsetG = DIM*jnrG;
1688 j_coord_offsetH = DIM*jnrH;
1690 /* load j atom coordinates */
1691 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1692 x+j_coord_offsetC,x+j_coord_offsetD,
1693 x+j_coord_offsetE,x+j_coord_offsetF,
1694 x+j_coord_offsetG,x+j_coord_offsetH,
1695 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1697 /* Calculate displacement vector */
1698 dx00 = _mm256_sub_ps(ix0,jx0);
1699 dy00 = _mm256_sub_ps(iy0,jy0);
1700 dz00 = _mm256_sub_ps(iz0,jz0);
1701 dx01 = _mm256_sub_ps(ix0,jx1);
1702 dy01 = _mm256_sub_ps(iy0,jy1);
1703 dz01 = _mm256_sub_ps(iz0,jz1);
1704 dx02 = _mm256_sub_ps(ix0,jx2);
1705 dy02 = _mm256_sub_ps(iy0,jy2);
1706 dz02 = _mm256_sub_ps(iz0,jz2);
1707 dx10 = _mm256_sub_ps(ix1,jx0);
1708 dy10 = _mm256_sub_ps(iy1,jy0);
1709 dz10 = _mm256_sub_ps(iz1,jz0);
1710 dx11 = _mm256_sub_ps(ix1,jx1);
1711 dy11 = _mm256_sub_ps(iy1,jy1);
1712 dz11 = _mm256_sub_ps(iz1,jz1);
1713 dx12 = _mm256_sub_ps(ix1,jx2);
1714 dy12 = _mm256_sub_ps(iy1,jy2);
1715 dz12 = _mm256_sub_ps(iz1,jz2);
1716 dx20 = _mm256_sub_ps(ix2,jx0);
1717 dy20 = _mm256_sub_ps(iy2,jy0);
1718 dz20 = _mm256_sub_ps(iz2,jz0);
1719 dx21 = _mm256_sub_ps(ix2,jx1);
1720 dy21 = _mm256_sub_ps(iy2,jy1);
1721 dz21 = _mm256_sub_ps(iz2,jz1);
1722 dx22 = _mm256_sub_ps(ix2,jx2);
1723 dy22 = _mm256_sub_ps(iy2,jy2);
1724 dz22 = _mm256_sub_ps(iz2,jz2);
1726 /* Calculate squared distance and things based on it */
1727 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1728 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1729 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1730 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1731 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1732 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1733 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1734 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1735 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1737 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1738 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1739 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1740 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1741 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1742 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1743 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1744 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1745 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1747 fjx0 = _mm256_setzero_ps();
1748 fjy0 = _mm256_setzero_ps();
1749 fjz0 = _mm256_setzero_ps();
1750 fjx1 = _mm256_setzero_ps();
1751 fjy1 = _mm256_setzero_ps();
1752 fjz1 = _mm256_setzero_ps();
1753 fjx2 = _mm256_setzero_ps();
1754 fjy2 = _mm256_setzero_ps();
1755 fjz2 = _mm256_setzero_ps();
1757 /**************************
1758 * CALCULATE INTERACTIONS *
1759 **************************/
1761 r00 = _mm256_mul_ps(rsq00,rinv00);
1763 /* Calculate table index by multiplying r with table scale and truncate to integer */
1764 rt = _mm256_mul_ps(r00,vftabscale);
1765 vfitab = _mm256_cvttps_epi32(rt);
1766 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1767 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1768 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1769 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1770 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1771 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1773 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1774 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1775 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1776 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1777 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1778 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1779 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1780 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1781 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1782 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1783 Heps = _mm256_mul_ps(vfeps,H);
1784 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1785 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1786 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1788 /* CUBIC SPLINE TABLE DISPERSION */
1789 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1790 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1791 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1792 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1793 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1794 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1795 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1796 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1797 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1798 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1799 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1800 Heps = _mm256_mul_ps(vfeps,H);
1801 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1802 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1803 fvdw6 = _mm256_mul_ps(c6_00,FF);
1805 /* CUBIC SPLINE TABLE REPULSION */
1806 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1807 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1808 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1809 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1810 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1811 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1812 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1813 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1814 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1815 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1816 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1817 Heps = _mm256_mul_ps(vfeps,H);
1818 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1819 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1820 fvdw12 = _mm256_mul_ps(c12_00,FF);
1821 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1823 fscal = _mm256_add_ps(felec,fvdw);
1825 /* Calculate temporary vectorial force */
1826 tx = _mm256_mul_ps(fscal,dx00);
1827 ty = _mm256_mul_ps(fscal,dy00);
1828 tz = _mm256_mul_ps(fscal,dz00);
1830 /* Update vectorial force */
1831 fix0 = _mm256_add_ps(fix0,tx);
1832 fiy0 = _mm256_add_ps(fiy0,ty);
1833 fiz0 = _mm256_add_ps(fiz0,tz);
1835 fjx0 = _mm256_add_ps(fjx0,tx);
1836 fjy0 = _mm256_add_ps(fjy0,ty);
1837 fjz0 = _mm256_add_ps(fjz0,tz);
1839 /**************************
1840 * CALCULATE INTERACTIONS *
1841 **************************/
1843 r01 = _mm256_mul_ps(rsq01,rinv01);
1845 /* Calculate table index by multiplying r with table scale and truncate to integer */
1846 rt = _mm256_mul_ps(r01,vftabscale);
1847 vfitab = _mm256_cvttps_epi32(rt);
1848 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1849 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1850 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1851 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1852 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1853 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1855 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1856 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1857 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1858 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1859 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1860 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1861 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1862 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1863 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1864 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1865 Heps = _mm256_mul_ps(vfeps,H);
1866 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1867 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1868 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1872 /* Calculate temporary vectorial force */
1873 tx = _mm256_mul_ps(fscal,dx01);
1874 ty = _mm256_mul_ps(fscal,dy01);
1875 tz = _mm256_mul_ps(fscal,dz01);
1877 /* Update vectorial force */
1878 fix0 = _mm256_add_ps(fix0,tx);
1879 fiy0 = _mm256_add_ps(fiy0,ty);
1880 fiz0 = _mm256_add_ps(fiz0,tz);
1882 fjx1 = _mm256_add_ps(fjx1,tx);
1883 fjy1 = _mm256_add_ps(fjy1,ty);
1884 fjz1 = _mm256_add_ps(fjz1,tz);
1886 /**************************
1887 * CALCULATE INTERACTIONS *
1888 **************************/
1890 r02 = _mm256_mul_ps(rsq02,rinv02);
1892 /* Calculate table index by multiplying r with table scale and truncate to integer */
1893 rt = _mm256_mul_ps(r02,vftabscale);
1894 vfitab = _mm256_cvttps_epi32(rt);
1895 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1896 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1897 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1898 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1899 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1900 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1902 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1903 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1904 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1905 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1906 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1907 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1908 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1909 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1910 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1911 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1912 Heps = _mm256_mul_ps(vfeps,H);
1913 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1914 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1915 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1919 /* Calculate temporary vectorial force */
1920 tx = _mm256_mul_ps(fscal,dx02);
1921 ty = _mm256_mul_ps(fscal,dy02);
1922 tz = _mm256_mul_ps(fscal,dz02);
1924 /* Update vectorial force */
1925 fix0 = _mm256_add_ps(fix0,tx);
1926 fiy0 = _mm256_add_ps(fiy0,ty);
1927 fiz0 = _mm256_add_ps(fiz0,tz);
1929 fjx2 = _mm256_add_ps(fjx2,tx);
1930 fjy2 = _mm256_add_ps(fjy2,ty);
1931 fjz2 = _mm256_add_ps(fjz2,tz);
1933 /**************************
1934 * CALCULATE INTERACTIONS *
1935 **************************/
1937 r10 = _mm256_mul_ps(rsq10,rinv10);
1939 /* Calculate table index by multiplying r with table scale and truncate to integer */
1940 rt = _mm256_mul_ps(r10,vftabscale);
1941 vfitab = _mm256_cvttps_epi32(rt);
1942 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1943 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1944 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1945 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1946 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1947 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1949 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1950 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1951 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1952 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1953 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1954 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1955 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1956 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1957 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1958 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1959 Heps = _mm256_mul_ps(vfeps,H);
1960 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1961 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1962 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1966 /* Calculate temporary vectorial force */
1967 tx = _mm256_mul_ps(fscal,dx10);
1968 ty = _mm256_mul_ps(fscal,dy10);
1969 tz = _mm256_mul_ps(fscal,dz10);
1971 /* Update vectorial force */
1972 fix1 = _mm256_add_ps(fix1,tx);
1973 fiy1 = _mm256_add_ps(fiy1,ty);
1974 fiz1 = _mm256_add_ps(fiz1,tz);
1976 fjx0 = _mm256_add_ps(fjx0,tx);
1977 fjy0 = _mm256_add_ps(fjy0,ty);
1978 fjz0 = _mm256_add_ps(fjz0,tz);
1980 /**************************
1981 * CALCULATE INTERACTIONS *
1982 **************************/
1984 r11 = _mm256_mul_ps(rsq11,rinv11);
1986 /* Calculate table index by multiplying r with table scale and truncate to integer */
1987 rt = _mm256_mul_ps(r11,vftabscale);
1988 vfitab = _mm256_cvttps_epi32(rt);
1989 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1990 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1991 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1992 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1993 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1994 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1996 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1997 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1998 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1999 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2000 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2001 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2002 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2003 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2004 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2005 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2006 Heps = _mm256_mul_ps(vfeps,H);
2007 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2008 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2009 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2013 /* Calculate temporary vectorial force */
2014 tx = _mm256_mul_ps(fscal,dx11);
2015 ty = _mm256_mul_ps(fscal,dy11);
2016 tz = _mm256_mul_ps(fscal,dz11);
2018 /* Update vectorial force */
2019 fix1 = _mm256_add_ps(fix1,tx);
2020 fiy1 = _mm256_add_ps(fiy1,ty);
2021 fiz1 = _mm256_add_ps(fiz1,tz);
2023 fjx1 = _mm256_add_ps(fjx1,tx);
2024 fjy1 = _mm256_add_ps(fjy1,ty);
2025 fjz1 = _mm256_add_ps(fjz1,tz);
2027 /**************************
2028 * CALCULATE INTERACTIONS *
2029 **************************/
2031 r12 = _mm256_mul_ps(rsq12,rinv12);
2033 /* Calculate table index by multiplying r with table scale and truncate to integer */
2034 rt = _mm256_mul_ps(r12,vftabscale);
2035 vfitab = _mm256_cvttps_epi32(rt);
2036 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2037 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2038 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2039 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2040 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2041 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2043 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2044 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2045 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2046 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2047 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2048 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2049 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2050 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2051 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2052 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2053 Heps = _mm256_mul_ps(vfeps,H);
2054 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2055 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2056 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2060 /* Calculate temporary vectorial force */
2061 tx = _mm256_mul_ps(fscal,dx12);
2062 ty = _mm256_mul_ps(fscal,dy12);
2063 tz = _mm256_mul_ps(fscal,dz12);
2065 /* Update vectorial force */
2066 fix1 = _mm256_add_ps(fix1,tx);
2067 fiy1 = _mm256_add_ps(fiy1,ty);
2068 fiz1 = _mm256_add_ps(fiz1,tz);
2070 fjx2 = _mm256_add_ps(fjx2,tx);
2071 fjy2 = _mm256_add_ps(fjy2,ty);
2072 fjz2 = _mm256_add_ps(fjz2,tz);
2074 /**************************
2075 * CALCULATE INTERACTIONS *
2076 **************************/
2078 r20 = _mm256_mul_ps(rsq20,rinv20);
2080 /* Calculate table index by multiplying r with table scale and truncate to integer */
2081 rt = _mm256_mul_ps(r20,vftabscale);
2082 vfitab = _mm256_cvttps_epi32(rt);
2083 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2084 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2085 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2086 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2087 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2088 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2090 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2091 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2092 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2093 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2094 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2095 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2096 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2097 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2098 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2099 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2100 Heps = _mm256_mul_ps(vfeps,H);
2101 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2102 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2103 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2107 /* Calculate temporary vectorial force */
2108 tx = _mm256_mul_ps(fscal,dx20);
2109 ty = _mm256_mul_ps(fscal,dy20);
2110 tz = _mm256_mul_ps(fscal,dz20);
2112 /* Update vectorial force */
2113 fix2 = _mm256_add_ps(fix2,tx);
2114 fiy2 = _mm256_add_ps(fiy2,ty);
2115 fiz2 = _mm256_add_ps(fiz2,tz);
2117 fjx0 = _mm256_add_ps(fjx0,tx);
2118 fjy0 = _mm256_add_ps(fjy0,ty);
2119 fjz0 = _mm256_add_ps(fjz0,tz);
2121 /**************************
2122 * CALCULATE INTERACTIONS *
2123 **************************/
2125 r21 = _mm256_mul_ps(rsq21,rinv21);
2127 /* Calculate table index by multiplying r with table scale and truncate to integer */
2128 rt = _mm256_mul_ps(r21,vftabscale);
2129 vfitab = _mm256_cvttps_epi32(rt);
2130 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2131 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2132 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2133 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2134 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2135 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2137 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2138 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2139 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2140 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2141 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2142 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2143 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2144 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2145 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2146 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2147 Heps = _mm256_mul_ps(vfeps,H);
2148 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2149 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2150 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2154 /* Calculate temporary vectorial force */
2155 tx = _mm256_mul_ps(fscal,dx21);
2156 ty = _mm256_mul_ps(fscal,dy21);
2157 tz = _mm256_mul_ps(fscal,dz21);
2159 /* Update vectorial force */
2160 fix2 = _mm256_add_ps(fix2,tx);
2161 fiy2 = _mm256_add_ps(fiy2,ty);
2162 fiz2 = _mm256_add_ps(fiz2,tz);
2164 fjx1 = _mm256_add_ps(fjx1,tx);
2165 fjy1 = _mm256_add_ps(fjy1,ty);
2166 fjz1 = _mm256_add_ps(fjz1,tz);
2168 /**************************
2169 * CALCULATE INTERACTIONS *
2170 **************************/
2172 r22 = _mm256_mul_ps(rsq22,rinv22);
2174 /* Calculate table index by multiplying r with table scale and truncate to integer */
2175 rt = _mm256_mul_ps(r22,vftabscale);
2176 vfitab = _mm256_cvttps_epi32(rt);
2177 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2178 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2179 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2180 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2181 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2182 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2184 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2185 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2186 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2187 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2188 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2189 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2190 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2191 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2192 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2193 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2194 Heps = _mm256_mul_ps(vfeps,H);
2195 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2196 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2197 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2201 /* Calculate temporary vectorial force */
2202 tx = _mm256_mul_ps(fscal,dx22);
2203 ty = _mm256_mul_ps(fscal,dy22);
2204 tz = _mm256_mul_ps(fscal,dz22);
2206 /* Update vectorial force */
2207 fix2 = _mm256_add_ps(fix2,tx);
2208 fiy2 = _mm256_add_ps(fiy2,ty);
2209 fiz2 = _mm256_add_ps(fiz2,tz);
2211 fjx2 = _mm256_add_ps(fjx2,tx);
2212 fjy2 = _mm256_add_ps(fjy2,ty);
2213 fjz2 = _mm256_add_ps(fjz2,tz);
2215 fjptrA = f+j_coord_offsetA;
2216 fjptrB = f+j_coord_offsetB;
2217 fjptrC = f+j_coord_offsetC;
2218 fjptrD = f+j_coord_offsetD;
2219 fjptrE = f+j_coord_offsetE;
2220 fjptrF = f+j_coord_offsetF;
2221 fjptrG = f+j_coord_offsetG;
2222 fjptrH = f+j_coord_offsetH;
2224 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2225 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2227 /* Inner loop uses 373 flops */
2230 if(jidx<j_index_end)
2233 /* Get j neighbor index, and coordinate index */
2234 jnrlistA = jjnr[jidx];
2235 jnrlistB = jjnr[jidx+1];
2236 jnrlistC = jjnr[jidx+2];
2237 jnrlistD = jjnr[jidx+3];
2238 jnrlistE = jjnr[jidx+4];
2239 jnrlistF = jjnr[jidx+5];
2240 jnrlistG = jjnr[jidx+6];
2241 jnrlistH = jjnr[jidx+7];
2242 /* Sign of each element will be negative for non-real atoms.
2243 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2244 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2246 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2247 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2249 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2250 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2251 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2252 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2253 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2254 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2255 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2256 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2257 j_coord_offsetA = DIM*jnrA;
2258 j_coord_offsetB = DIM*jnrB;
2259 j_coord_offsetC = DIM*jnrC;
2260 j_coord_offsetD = DIM*jnrD;
2261 j_coord_offsetE = DIM*jnrE;
2262 j_coord_offsetF = DIM*jnrF;
2263 j_coord_offsetG = DIM*jnrG;
2264 j_coord_offsetH = DIM*jnrH;
2266 /* load j atom coordinates */
2267 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2268 x+j_coord_offsetC,x+j_coord_offsetD,
2269 x+j_coord_offsetE,x+j_coord_offsetF,
2270 x+j_coord_offsetG,x+j_coord_offsetH,
2271 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2273 /* Calculate displacement vector */
2274 dx00 = _mm256_sub_ps(ix0,jx0);
2275 dy00 = _mm256_sub_ps(iy0,jy0);
2276 dz00 = _mm256_sub_ps(iz0,jz0);
2277 dx01 = _mm256_sub_ps(ix0,jx1);
2278 dy01 = _mm256_sub_ps(iy0,jy1);
2279 dz01 = _mm256_sub_ps(iz0,jz1);
2280 dx02 = _mm256_sub_ps(ix0,jx2);
2281 dy02 = _mm256_sub_ps(iy0,jy2);
2282 dz02 = _mm256_sub_ps(iz0,jz2);
2283 dx10 = _mm256_sub_ps(ix1,jx0);
2284 dy10 = _mm256_sub_ps(iy1,jy0);
2285 dz10 = _mm256_sub_ps(iz1,jz0);
2286 dx11 = _mm256_sub_ps(ix1,jx1);
2287 dy11 = _mm256_sub_ps(iy1,jy1);
2288 dz11 = _mm256_sub_ps(iz1,jz1);
2289 dx12 = _mm256_sub_ps(ix1,jx2);
2290 dy12 = _mm256_sub_ps(iy1,jy2);
2291 dz12 = _mm256_sub_ps(iz1,jz2);
2292 dx20 = _mm256_sub_ps(ix2,jx0);
2293 dy20 = _mm256_sub_ps(iy2,jy0);
2294 dz20 = _mm256_sub_ps(iz2,jz0);
2295 dx21 = _mm256_sub_ps(ix2,jx1);
2296 dy21 = _mm256_sub_ps(iy2,jy1);
2297 dz21 = _mm256_sub_ps(iz2,jz1);
2298 dx22 = _mm256_sub_ps(ix2,jx2);
2299 dy22 = _mm256_sub_ps(iy2,jy2);
2300 dz22 = _mm256_sub_ps(iz2,jz2);
2302 /* Calculate squared distance and things based on it */
2303 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2304 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2305 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2306 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2307 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2308 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2309 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2310 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2311 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2313 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2314 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
2315 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
2316 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
2317 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2318 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2319 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
2320 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2321 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2323 fjx0 = _mm256_setzero_ps();
2324 fjy0 = _mm256_setzero_ps();
2325 fjz0 = _mm256_setzero_ps();
2326 fjx1 = _mm256_setzero_ps();
2327 fjy1 = _mm256_setzero_ps();
2328 fjz1 = _mm256_setzero_ps();
2329 fjx2 = _mm256_setzero_ps();
2330 fjy2 = _mm256_setzero_ps();
2331 fjz2 = _mm256_setzero_ps();
2333 /**************************
2334 * CALCULATE INTERACTIONS *
2335 **************************/
2337 r00 = _mm256_mul_ps(rsq00,rinv00);
2338 r00 = _mm256_andnot_ps(dummy_mask,r00);
2340 /* Calculate table index by multiplying r with table scale and truncate to integer */
2341 rt = _mm256_mul_ps(r00,vftabscale);
2342 vfitab = _mm256_cvttps_epi32(rt);
2343 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2344 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2345 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2346 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2347 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2348 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2350 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2351 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2352 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2353 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2354 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2355 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2356 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2357 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2358 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2359 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2360 Heps = _mm256_mul_ps(vfeps,H);
2361 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2362 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2363 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2365 /* CUBIC SPLINE TABLE DISPERSION */
2366 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2367 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2368 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2369 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2370 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2371 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2372 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2373 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2374 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2375 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2376 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2377 Heps = _mm256_mul_ps(vfeps,H);
2378 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2379 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2380 fvdw6 = _mm256_mul_ps(c6_00,FF);
2382 /* CUBIC SPLINE TABLE REPULSION */
2383 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2384 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2385 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2386 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2387 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2388 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2389 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2390 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2391 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2392 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2393 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2394 Heps = _mm256_mul_ps(vfeps,H);
2395 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2396 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2397 fvdw12 = _mm256_mul_ps(c12_00,FF);
2398 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2400 fscal = _mm256_add_ps(felec,fvdw);
2402 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2404 /* Calculate temporary vectorial force */
2405 tx = _mm256_mul_ps(fscal,dx00);
2406 ty = _mm256_mul_ps(fscal,dy00);
2407 tz = _mm256_mul_ps(fscal,dz00);
2409 /* Update vectorial force */
2410 fix0 = _mm256_add_ps(fix0,tx);
2411 fiy0 = _mm256_add_ps(fiy0,ty);
2412 fiz0 = _mm256_add_ps(fiz0,tz);
2414 fjx0 = _mm256_add_ps(fjx0,tx);
2415 fjy0 = _mm256_add_ps(fjy0,ty);
2416 fjz0 = _mm256_add_ps(fjz0,tz);
2418 /**************************
2419 * CALCULATE INTERACTIONS *
2420 **************************/
2422 r01 = _mm256_mul_ps(rsq01,rinv01);
2423 r01 = _mm256_andnot_ps(dummy_mask,r01);
2425 /* Calculate table index by multiplying r with table scale and truncate to integer */
2426 rt = _mm256_mul_ps(r01,vftabscale);
2427 vfitab = _mm256_cvttps_epi32(rt);
2428 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2429 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2430 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2431 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2432 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2433 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2435 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2436 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2437 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2438 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2439 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2440 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2441 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2442 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2443 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2444 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2445 Heps = _mm256_mul_ps(vfeps,H);
2446 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2447 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2448 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2452 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2454 /* Calculate temporary vectorial force */
2455 tx = _mm256_mul_ps(fscal,dx01);
2456 ty = _mm256_mul_ps(fscal,dy01);
2457 tz = _mm256_mul_ps(fscal,dz01);
2459 /* Update vectorial force */
2460 fix0 = _mm256_add_ps(fix0,tx);
2461 fiy0 = _mm256_add_ps(fiy0,ty);
2462 fiz0 = _mm256_add_ps(fiz0,tz);
2464 fjx1 = _mm256_add_ps(fjx1,tx);
2465 fjy1 = _mm256_add_ps(fjy1,ty);
2466 fjz1 = _mm256_add_ps(fjz1,tz);
2468 /**************************
2469 * CALCULATE INTERACTIONS *
2470 **************************/
2472 r02 = _mm256_mul_ps(rsq02,rinv02);
2473 r02 = _mm256_andnot_ps(dummy_mask,r02);
2475 /* Calculate table index by multiplying r with table scale and truncate to integer */
2476 rt = _mm256_mul_ps(r02,vftabscale);
2477 vfitab = _mm256_cvttps_epi32(rt);
2478 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2479 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2480 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2481 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2482 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2483 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2485 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2486 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2487 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2488 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2489 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2490 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2491 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2492 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2493 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2494 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2495 Heps = _mm256_mul_ps(vfeps,H);
2496 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2497 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2498 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2502 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2504 /* Calculate temporary vectorial force */
2505 tx = _mm256_mul_ps(fscal,dx02);
2506 ty = _mm256_mul_ps(fscal,dy02);
2507 tz = _mm256_mul_ps(fscal,dz02);
2509 /* Update vectorial force */
2510 fix0 = _mm256_add_ps(fix0,tx);
2511 fiy0 = _mm256_add_ps(fiy0,ty);
2512 fiz0 = _mm256_add_ps(fiz0,tz);
2514 fjx2 = _mm256_add_ps(fjx2,tx);
2515 fjy2 = _mm256_add_ps(fjy2,ty);
2516 fjz2 = _mm256_add_ps(fjz2,tz);
2518 /**************************
2519 * CALCULATE INTERACTIONS *
2520 **************************/
2522 r10 = _mm256_mul_ps(rsq10,rinv10);
2523 r10 = _mm256_andnot_ps(dummy_mask,r10);
2525 /* Calculate table index by multiplying r with table scale and truncate to integer */
2526 rt = _mm256_mul_ps(r10,vftabscale);
2527 vfitab = _mm256_cvttps_epi32(rt);
2528 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2529 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2530 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2531 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2532 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2533 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2535 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2536 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2537 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2538 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2539 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2540 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2541 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2542 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2543 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2544 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2545 Heps = _mm256_mul_ps(vfeps,H);
2546 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2547 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2548 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2552 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2554 /* Calculate temporary vectorial force */
2555 tx = _mm256_mul_ps(fscal,dx10);
2556 ty = _mm256_mul_ps(fscal,dy10);
2557 tz = _mm256_mul_ps(fscal,dz10);
2559 /* Update vectorial force */
2560 fix1 = _mm256_add_ps(fix1,tx);
2561 fiy1 = _mm256_add_ps(fiy1,ty);
2562 fiz1 = _mm256_add_ps(fiz1,tz);
2564 fjx0 = _mm256_add_ps(fjx0,tx);
2565 fjy0 = _mm256_add_ps(fjy0,ty);
2566 fjz0 = _mm256_add_ps(fjz0,tz);
2568 /**************************
2569 * CALCULATE INTERACTIONS *
2570 **************************/
2572 r11 = _mm256_mul_ps(rsq11,rinv11);
2573 r11 = _mm256_andnot_ps(dummy_mask,r11);
2575 /* Calculate table index by multiplying r with table scale and truncate to integer */
2576 rt = _mm256_mul_ps(r11,vftabscale);
2577 vfitab = _mm256_cvttps_epi32(rt);
2578 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2579 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2580 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2581 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2582 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2583 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2585 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2586 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2587 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2588 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2589 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2590 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2591 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2592 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2593 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2594 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2595 Heps = _mm256_mul_ps(vfeps,H);
2596 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2597 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2598 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2602 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2604 /* Calculate temporary vectorial force */
2605 tx = _mm256_mul_ps(fscal,dx11);
2606 ty = _mm256_mul_ps(fscal,dy11);
2607 tz = _mm256_mul_ps(fscal,dz11);
2609 /* Update vectorial force */
2610 fix1 = _mm256_add_ps(fix1,tx);
2611 fiy1 = _mm256_add_ps(fiy1,ty);
2612 fiz1 = _mm256_add_ps(fiz1,tz);
2614 fjx1 = _mm256_add_ps(fjx1,tx);
2615 fjy1 = _mm256_add_ps(fjy1,ty);
2616 fjz1 = _mm256_add_ps(fjz1,tz);
2618 /**************************
2619 * CALCULATE INTERACTIONS *
2620 **************************/
2622 r12 = _mm256_mul_ps(rsq12,rinv12);
2623 r12 = _mm256_andnot_ps(dummy_mask,r12);
2625 /* Calculate table index by multiplying r with table scale and truncate to integer */
2626 rt = _mm256_mul_ps(r12,vftabscale);
2627 vfitab = _mm256_cvttps_epi32(rt);
2628 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2629 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2630 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2631 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2632 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2633 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2635 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2636 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2637 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2638 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2639 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2640 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2641 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2642 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2643 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2644 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2645 Heps = _mm256_mul_ps(vfeps,H);
2646 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2647 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2648 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2652 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2654 /* Calculate temporary vectorial force */
2655 tx = _mm256_mul_ps(fscal,dx12);
2656 ty = _mm256_mul_ps(fscal,dy12);
2657 tz = _mm256_mul_ps(fscal,dz12);
2659 /* Update vectorial force */
2660 fix1 = _mm256_add_ps(fix1,tx);
2661 fiy1 = _mm256_add_ps(fiy1,ty);
2662 fiz1 = _mm256_add_ps(fiz1,tz);
2664 fjx2 = _mm256_add_ps(fjx2,tx);
2665 fjy2 = _mm256_add_ps(fjy2,ty);
2666 fjz2 = _mm256_add_ps(fjz2,tz);
2668 /**************************
2669 * CALCULATE INTERACTIONS *
2670 **************************/
2672 r20 = _mm256_mul_ps(rsq20,rinv20);
2673 r20 = _mm256_andnot_ps(dummy_mask,r20);
2675 /* Calculate table index by multiplying r with table scale and truncate to integer */
2676 rt = _mm256_mul_ps(r20,vftabscale);
2677 vfitab = _mm256_cvttps_epi32(rt);
2678 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2679 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2680 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2681 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2682 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2683 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2685 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2686 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2687 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2688 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2689 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2690 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2691 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2692 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2693 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2694 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2695 Heps = _mm256_mul_ps(vfeps,H);
2696 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2697 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2698 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2702 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2704 /* Calculate temporary vectorial force */
2705 tx = _mm256_mul_ps(fscal,dx20);
2706 ty = _mm256_mul_ps(fscal,dy20);
2707 tz = _mm256_mul_ps(fscal,dz20);
2709 /* Update vectorial force */
2710 fix2 = _mm256_add_ps(fix2,tx);
2711 fiy2 = _mm256_add_ps(fiy2,ty);
2712 fiz2 = _mm256_add_ps(fiz2,tz);
2714 fjx0 = _mm256_add_ps(fjx0,tx);
2715 fjy0 = _mm256_add_ps(fjy0,ty);
2716 fjz0 = _mm256_add_ps(fjz0,tz);
2718 /**************************
2719 * CALCULATE INTERACTIONS *
2720 **************************/
2722 r21 = _mm256_mul_ps(rsq21,rinv21);
2723 r21 = _mm256_andnot_ps(dummy_mask,r21);
2725 /* Calculate table index by multiplying r with table scale and truncate to integer */
2726 rt = _mm256_mul_ps(r21,vftabscale);
2727 vfitab = _mm256_cvttps_epi32(rt);
2728 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2729 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2730 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2731 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2732 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2733 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2735 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2736 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2737 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2738 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2739 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2740 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2741 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2742 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2743 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2744 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2745 Heps = _mm256_mul_ps(vfeps,H);
2746 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2747 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2748 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2752 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2754 /* Calculate temporary vectorial force */
2755 tx = _mm256_mul_ps(fscal,dx21);
2756 ty = _mm256_mul_ps(fscal,dy21);
2757 tz = _mm256_mul_ps(fscal,dz21);
2759 /* Update vectorial force */
2760 fix2 = _mm256_add_ps(fix2,tx);
2761 fiy2 = _mm256_add_ps(fiy2,ty);
2762 fiz2 = _mm256_add_ps(fiz2,tz);
2764 fjx1 = _mm256_add_ps(fjx1,tx);
2765 fjy1 = _mm256_add_ps(fjy1,ty);
2766 fjz1 = _mm256_add_ps(fjz1,tz);
2768 /**************************
2769 * CALCULATE INTERACTIONS *
2770 **************************/
2772 r22 = _mm256_mul_ps(rsq22,rinv22);
2773 r22 = _mm256_andnot_ps(dummy_mask,r22);
2775 /* Calculate table index by multiplying r with table scale and truncate to integer */
2776 rt = _mm256_mul_ps(r22,vftabscale);
2777 vfitab = _mm256_cvttps_epi32(rt);
2778 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2779 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2780 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2781 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2782 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2783 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2785 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2786 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2787 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2788 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2789 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2790 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2791 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2792 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2793 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2794 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2795 Heps = _mm256_mul_ps(vfeps,H);
2796 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2797 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2798 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2802 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2804 /* Calculate temporary vectorial force */
2805 tx = _mm256_mul_ps(fscal,dx22);
2806 ty = _mm256_mul_ps(fscal,dy22);
2807 tz = _mm256_mul_ps(fscal,dz22);
2809 /* Update vectorial force */
2810 fix2 = _mm256_add_ps(fix2,tx);
2811 fiy2 = _mm256_add_ps(fiy2,ty);
2812 fiz2 = _mm256_add_ps(fiz2,tz);
2814 fjx2 = _mm256_add_ps(fjx2,tx);
2815 fjy2 = _mm256_add_ps(fjy2,ty);
2816 fjz2 = _mm256_add_ps(fjz2,tz);
2818 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2819 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2820 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2821 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2822 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2823 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2824 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2825 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2827 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2828 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2830 /* Inner loop uses 382 flops */
2833 /* End of innermost loop */
2835 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2836 f+i_coord_offset,fshift+i_shift_offset);
2838 /* Increment number of inner iterations */
2839 inneriter += j_index_end - j_index_start;
2841 /* Outer loop uses 18 flops */
2844 /* Increment number of outer iterations */
2847 /* Update outer/inner flops */
2849 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);