2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: None
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
96 __m128i vfitab_lo,vfitab_hi;
97 __m128i ifour = _mm_set1_epi32(4);
98 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
100 __m256 dummy_mask,cutoff_mask;
101 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
102 __m256 one = _mm256_set1_ps(1.0);
103 __m256 two = _mm256_set1_ps(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm256_set1_ps(fr->epsfac);
116 charge = mdatoms->chargeA;
118 vftab = kernel_data->table_elec->data;
119 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
121 /* Setup water-specific parameters */
122 inr = nlist->iinr[0];
123 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
124 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
125 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
127 jq0 = _mm256_set1_ps(charge[inr+0]);
128 jq1 = _mm256_set1_ps(charge[inr+1]);
129 jq2 = _mm256_set1_ps(charge[inr+2]);
130 qq00 = _mm256_mul_ps(iq0,jq0);
131 qq01 = _mm256_mul_ps(iq0,jq1);
132 qq02 = _mm256_mul_ps(iq0,jq2);
133 qq10 = _mm256_mul_ps(iq1,jq0);
134 qq11 = _mm256_mul_ps(iq1,jq1);
135 qq12 = _mm256_mul_ps(iq1,jq2);
136 qq20 = _mm256_mul_ps(iq2,jq0);
137 qq21 = _mm256_mul_ps(iq2,jq1);
138 qq22 = _mm256_mul_ps(iq2,jq2);
140 /* Avoid stupid compiler warnings */
141 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
154 for(iidx=0;iidx<4*DIM;iidx++)
159 /* Start outer loop over neighborlists */
160 for(iidx=0; iidx<nri; iidx++)
162 /* Load shift vector for this list */
163 i_shift_offset = DIM*shiftidx[iidx];
165 /* Load limits for loop over neighbors */
166 j_index_start = jindex[iidx];
167 j_index_end = jindex[iidx+1];
169 /* Get outer coordinate index */
171 i_coord_offset = DIM*inr;
173 /* Load i particle coords and add shift vector */
174 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
175 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
177 fix0 = _mm256_setzero_ps();
178 fiy0 = _mm256_setzero_ps();
179 fiz0 = _mm256_setzero_ps();
180 fix1 = _mm256_setzero_ps();
181 fiy1 = _mm256_setzero_ps();
182 fiz1 = _mm256_setzero_ps();
183 fix2 = _mm256_setzero_ps();
184 fiy2 = _mm256_setzero_ps();
185 fiz2 = _mm256_setzero_ps();
187 /* Reset potential sums */
188 velecsum = _mm256_setzero_ps();
190 /* Start inner kernel loop */
191 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
194 /* Get j neighbor index, and coordinate index */
203 j_coord_offsetA = DIM*jnrA;
204 j_coord_offsetB = DIM*jnrB;
205 j_coord_offsetC = DIM*jnrC;
206 j_coord_offsetD = DIM*jnrD;
207 j_coord_offsetE = DIM*jnrE;
208 j_coord_offsetF = DIM*jnrF;
209 j_coord_offsetG = DIM*jnrG;
210 j_coord_offsetH = DIM*jnrH;
212 /* load j atom coordinates */
213 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
214 x+j_coord_offsetC,x+j_coord_offsetD,
215 x+j_coord_offsetE,x+j_coord_offsetF,
216 x+j_coord_offsetG,x+j_coord_offsetH,
217 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
219 /* Calculate displacement vector */
220 dx00 = _mm256_sub_ps(ix0,jx0);
221 dy00 = _mm256_sub_ps(iy0,jy0);
222 dz00 = _mm256_sub_ps(iz0,jz0);
223 dx01 = _mm256_sub_ps(ix0,jx1);
224 dy01 = _mm256_sub_ps(iy0,jy1);
225 dz01 = _mm256_sub_ps(iz0,jz1);
226 dx02 = _mm256_sub_ps(ix0,jx2);
227 dy02 = _mm256_sub_ps(iy0,jy2);
228 dz02 = _mm256_sub_ps(iz0,jz2);
229 dx10 = _mm256_sub_ps(ix1,jx0);
230 dy10 = _mm256_sub_ps(iy1,jy0);
231 dz10 = _mm256_sub_ps(iz1,jz0);
232 dx11 = _mm256_sub_ps(ix1,jx1);
233 dy11 = _mm256_sub_ps(iy1,jy1);
234 dz11 = _mm256_sub_ps(iz1,jz1);
235 dx12 = _mm256_sub_ps(ix1,jx2);
236 dy12 = _mm256_sub_ps(iy1,jy2);
237 dz12 = _mm256_sub_ps(iz1,jz2);
238 dx20 = _mm256_sub_ps(ix2,jx0);
239 dy20 = _mm256_sub_ps(iy2,jy0);
240 dz20 = _mm256_sub_ps(iz2,jz0);
241 dx21 = _mm256_sub_ps(ix2,jx1);
242 dy21 = _mm256_sub_ps(iy2,jy1);
243 dz21 = _mm256_sub_ps(iz2,jz1);
244 dx22 = _mm256_sub_ps(ix2,jx2);
245 dy22 = _mm256_sub_ps(iy2,jy2);
246 dz22 = _mm256_sub_ps(iz2,jz2);
248 /* Calculate squared distance and things based on it */
249 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
250 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
251 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
252 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
253 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
254 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
255 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
256 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
257 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
259 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
260 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
261 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
262 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
263 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
264 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
265 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
266 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
267 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
269 fjx0 = _mm256_setzero_ps();
270 fjy0 = _mm256_setzero_ps();
271 fjz0 = _mm256_setzero_ps();
272 fjx1 = _mm256_setzero_ps();
273 fjy1 = _mm256_setzero_ps();
274 fjz1 = _mm256_setzero_ps();
275 fjx2 = _mm256_setzero_ps();
276 fjy2 = _mm256_setzero_ps();
277 fjz2 = _mm256_setzero_ps();
279 /**************************
280 * CALCULATE INTERACTIONS *
281 **************************/
283 r00 = _mm256_mul_ps(rsq00,rinv00);
285 /* Calculate table index by multiplying r with table scale and truncate to integer */
286 rt = _mm256_mul_ps(r00,vftabscale);
287 vfitab = _mm256_cvttps_epi32(rt);
288 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
289 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
290 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
291 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
292 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
293 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
295 /* CUBIC SPLINE TABLE ELECTROSTATICS */
296 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
297 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
298 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
299 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
300 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
301 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
302 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
303 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
304 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
305 Heps = _mm256_mul_ps(vfeps,H);
306 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
307 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
308 velec = _mm256_mul_ps(qq00,VV);
309 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
310 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
312 /* Update potential sum for this i atom from the interaction with this j atom. */
313 velecsum = _mm256_add_ps(velecsum,velec);
317 /* Calculate temporary vectorial force */
318 tx = _mm256_mul_ps(fscal,dx00);
319 ty = _mm256_mul_ps(fscal,dy00);
320 tz = _mm256_mul_ps(fscal,dz00);
322 /* Update vectorial force */
323 fix0 = _mm256_add_ps(fix0,tx);
324 fiy0 = _mm256_add_ps(fiy0,ty);
325 fiz0 = _mm256_add_ps(fiz0,tz);
327 fjx0 = _mm256_add_ps(fjx0,tx);
328 fjy0 = _mm256_add_ps(fjy0,ty);
329 fjz0 = _mm256_add_ps(fjz0,tz);
331 /**************************
332 * CALCULATE INTERACTIONS *
333 **************************/
335 r01 = _mm256_mul_ps(rsq01,rinv01);
337 /* Calculate table index by multiplying r with table scale and truncate to integer */
338 rt = _mm256_mul_ps(r01,vftabscale);
339 vfitab = _mm256_cvttps_epi32(rt);
340 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
341 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
342 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
343 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
344 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
345 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
347 /* CUBIC SPLINE TABLE ELECTROSTATICS */
348 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
349 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
350 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
351 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
352 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
353 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
354 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
355 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
356 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
357 Heps = _mm256_mul_ps(vfeps,H);
358 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
359 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
360 velec = _mm256_mul_ps(qq01,VV);
361 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
362 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
364 /* Update potential sum for this i atom from the interaction with this j atom. */
365 velecsum = _mm256_add_ps(velecsum,velec);
369 /* Calculate temporary vectorial force */
370 tx = _mm256_mul_ps(fscal,dx01);
371 ty = _mm256_mul_ps(fscal,dy01);
372 tz = _mm256_mul_ps(fscal,dz01);
374 /* Update vectorial force */
375 fix0 = _mm256_add_ps(fix0,tx);
376 fiy0 = _mm256_add_ps(fiy0,ty);
377 fiz0 = _mm256_add_ps(fiz0,tz);
379 fjx1 = _mm256_add_ps(fjx1,tx);
380 fjy1 = _mm256_add_ps(fjy1,ty);
381 fjz1 = _mm256_add_ps(fjz1,tz);
383 /**************************
384 * CALCULATE INTERACTIONS *
385 **************************/
387 r02 = _mm256_mul_ps(rsq02,rinv02);
389 /* Calculate table index by multiplying r with table scale and truncate to integer */
390 rt = _mm256_mul_ps(r02,vftabscale);
391 vfitab = _mm256_cvttps_epi32(rt);
392 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
393 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
394 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
395 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
396 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
397 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
399 /* CUBIC SPLINE TABLE ELECTROSTATICS */
400 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
401 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
402 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
403 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
404 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
405 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
406 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
407 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
408 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
409 Heps = _mm256_mul_ps(vfeps,H);
410 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
411 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
412 velec = _mm256_mul_ps(qq02,VV);
413 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
414 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
416 /* Update potential sum for this i atom from the interaction with this j atom. */
417 velecsum = _mm256_add_ps(velecsum,velec);
421 /* Calculate temporary vectorial force */
422 tx = _mm256_mul_ps(fscal,dx02);
423 ty = _mm256_mul_ps(fscal,dy02);
424 tz = _mm256_mul_ps(fscal,dz02);
426 /* Update vectorial force */
427 fix0 = _mm256_add_ps(fix0,tx);
428 fiy0 = _mm256_add_ps(fiy0,ty);
429 fiz0 = _mm256_add_ps(fiz0,tz);
431 fjx2 = _mm256_add_ps(fjx2,tx);
432 fjy2 = _mm256_add_ps(fjy2,ty);
433 fjz2 = _mm256_add_ps(fjz2,tz);
435 /**************************
436 * CALCULATE INTERACTIONS *
437 **************************/
439 r10 = _mm256_mul_ps(rsq10,rinv10);
441 /* Calculate table index by multiplying r with table scale and truncate to integer */
442 rt = _mm256_mul_ps(r10,vftabscale);
443 vfitab = _mm256_cvttps_epi32(rt);
444 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
445 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
446 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
447 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
448 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
449 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
451 /* CUBIC SPLINE TABLE ELECTROSTATICS */
452 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
453 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
454 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
455 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
456 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
457 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
458 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
459 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
460 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
461 Heps = _mm256_mul_ps(vfeps,H);
462 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
463 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
464 velec = _mm256_mul_ps(qq10,VV);
465 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
466 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
468 /* Update potential sum for this i atom from the interaction with this j atom. */
469 velecsum = _mm256_add_ps(velecsum,velec);
473 /* Calculate temporary vectorial force */
474 tx = _mm256_mul_ps(fscal,dx10);
475 ty = _mm256_mul_ps(fscal,dy10);
476 tz = _mm256_mul_ps(fscal,dz10);
478 /* Update vectorial force */
479 fix1 = _mm256_add_ps(fix1,tx);
480 fiy1 = _mm256_add_ps(fiy1,ty);
481 fiz1 = _mm256_add_ps(fiz1,tz);
483 fjx0 = _mm256_add_ps(fjx0,tx);
484 fjy0 = _mm256_add_ps(fjy0,ty);
485 fjz0 = _mm256_add_ps(fjz0,tz);
487 /**************************
488 * CALCULATE INTERACTIONS *
489 **************************/
491 r11 = _mm256_mul_ps(rsq11,rinv11);
493 /* Calculate table index by multiplying r with table scale and truncate to integer */
494 rt = _mm256_mul_ps(r11,vftabscale);
495 vfitab = _mm256_cvttps_epi32(rt);
496 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
497 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
498 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
499 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
500 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
501 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
503 /* CUBIC SPLINE TABLE ELECTROSTATICS */
504 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
505 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
506 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
507 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
508 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
509 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
510 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
511 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
512 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
513 Heps = _mm256_mul_ps(vfeps,H);
514 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
515 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
516 velec = _mm256_mul_ps(qq11,VV);
517 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
518 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
520 /* Update potential sum for this i atom from the interaction with this j atom. */
521 velecsum = _mm256_add_ps(velecsum,velec);
525 /* Calculate temporary vectorial force */
526 tx = _mm256_mul_ps(fscal,dx11);
527 ty = _mm256_mul_ps(fscal,dy11);
528 tz = _mm256_mul_ps(fscal,dz11);
530 /* Update vectorial force */
531 fix1 = _mm256_add_ps(fix1,tx);
532 fiy1 = _mm256_add_ps(fiy1,ty);
533 fiz1 = _mm256_add_ps(fiz1,tz);
535 fjx1 = _mm256_add_ps(fjx1,tx);
536 fjy1 = _mm256_add_ps(fjy1,ty);
537 fjz1 = _mm256_add_ps(fjz1,tz);
539 /**************************
540 * CALCULATE INTERACTIONS *
541 **************************/
543 r12 = _mm256_mul_ps(rsq12,rinv12);
545 /* Calculate table index by multiplying r with table scale and truncate to integer */
546 rt = _mm256_mul_ps(r12,vftabscale);
547 vfitab = _mm256_cvttps_epi32(rt);
548 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
549 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
550 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
551 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
552 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
553 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
555 /* CUBIC SPLINE TABLE ELECTROSTATICS */
556 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
557 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
558 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
559 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
560 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
561 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
562 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
563 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
564 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
565 Heps = _mm256_mul_ps(vfeps,H);
566 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
567 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
568 velec = _mm256_mul_ps(qq12,VV);
569 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
570 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
572 /* Update potential sum for this i atom from the interaction with this j atom. */
573 velecsum = _mm256_add_ps(velecsum,velec);
577 /* Calculate temporary vectorial force */
578 tx = _mm256_mul_ps(fscal,dx12);
579 ty = _mm256_mul_ps(fscal,dy12);
580 tz = _mm256_mul_ps(fscal,dz12);
582 /* Update vectorial force */
583 fix1 = _mm256_add_ps(fix1,tx);
584 fiy1 = _mm256_add_ps(fiy1,ty);
585 fiz1 = _mm256_add_ps(fiz1,tz);
587 fjx2 = _mm256_add_ps(fjx2,tx);
588 fjy2 = _mm256_add_ps(fjy2,ty);
589 fjz2 = _mm256_add_ps(fjz2,tz);
591 /**************************
592 * CALCULATE INTERACTIONS *
593 **************************/
595 r20 = _mm256_mul_ps(rsq20,rinv20);
597 /* Calculate table index by multiplying r with table scale and truncate to integer */
598 rt = _mm256_mul_ps(r20,vftabscale);
599 vfitab = _mm256_cvttps_epi32(rt);
600 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
601 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
602 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
603 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
604 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
605 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
607 /* CUBIC SPLINE TABLE ELECTROSTATICS */
608 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
609 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
610 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
611 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
612 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
613 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
614 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
615 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
616 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
617 Heps = _mm256_mul_ps(vfeps,H);
618 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
619 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
620 velec = _mm256_mul_ps(qq20,VV);
621 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
622 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
624 /* Update potential sum for this i atom from the interaction with this j atom. */
625 velecsum = _mm256_add_ps(velecsum,velec);
629 /* Calculate temporary vectorial force */
630 tx = _mm256_mul_ps(fscal,dx20);
631 ty = _mm256_mul_ps(fscal,dy20);
632 tz = _mm256_mul_ps(fscal,dz20);
634 /* Update vectorial force */
635 fix2 = _mm256_add_ps(fix2,tx);
636 fiy2 = _mm256_add_ps(fiy2,ty);
637 fiz2 = _mm256_add_ps(fiz2,tz);
639 fjx0 = _mm256_add_ps(fjx0,tx);
640 fjy0 = _mm256_add_ps(fjy0,ty);
641 fjz0 = _mm256_add_ps(fjz0,tz);
643 /**************************
644 * CALCULATE INTERACTIONS *
645 **************************/
647 r21 = _mm256_mul_ps(rsq21,rinv21);
649 /* Calculate table index by multiplying r with table scale and truncate to integer */
650 rt = _mm256_mul_ps(r21,vftabscale);
651 vfitab = _mm256_cvttps_epi32(rt);
652 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
653 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
654 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
655 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
656 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
657 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
659 /* CUBIC SPLINE TABLE ELECTROSTATICS */
660 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
661 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
662 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
663 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
664 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
665 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
666 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
667 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
668 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
669 Heps = _mm256_mul_ps(vfeps,H);
670 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
671 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
672 velec = _mm256_mul_ps(qq21,VV);
673 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
674 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
676 /* Update potential sum for this i atom from the interaction with this j atom. */
677 velecsum = _mm256_add_ps(velecsum,velec);
681 /* Calculate temporary vectorial force */
682 tx = _mm256_mul_ps(fscal,dx21);
683 ty = _mm256_mul_ps(fscal,dy21);
684 tz = _mm256_mul_ps(fscal,dz21);
686 /* Update vectorial force */
687 fix2 = _mm256_add_ps(fix2,tx);
688 fiy2 = _mm256_add_ps(fiy2,ty);
689 fiz2 = _mm256_add_ps(fiz2,tz);
691 fjx1 = _mm256_add_ps(fjx1,tx);
692 fjy1 = _mm256_add_ps(fjy1,ty);
693 fjz1 = _mm256_add_ps(fjz1,tz);
695 /**************************
696 * CALCULATE INTERACTIONS *
697 **************************/
699 r22 = _mm256_mul_ps(rsq22,rinv22);
701 /* Calculate table index by multiplying r with table scale and truncate to integer */
702 rt = _mm256_mul_ps(r22,vftabscale);
703 vfitab = _mm256_cvttps_epi32(rt);
704 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
705 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
706 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
707 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
708 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
709 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
711 /* CUBIC SPLINE TABLE ELECTROSTATICS */
712 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
713 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
714 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
715 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
716 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
717 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
718 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
719 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
720 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
721 Heps = _mm256_mul_ps(vfeps,H);
722 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
723 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
724 velec = _mm256_mul_ps(qq22,VV);
725 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
726 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
728 /* Update potential sum for this i atom from the interaction with this j atom. */
729 velecsum = _mm256_add_ps(velecsum,velec);
733 /* Calculate temporary vectorial force */
734 tx = _mm256_mul_ps(fscal,dx22);
735 ty = _mm256_mul_ps(fscal,dy22);
736 tz = _mm256_mul_ps(fscal,dz22);
738 /* Update vectorial force */
739 fix2 = _mm256_add_ps(fix2,tx);
740 fiy2 = _mm256_add_ps(fiy2,ty);
741 fiz2 = _mm256_add_ps(fiz2,tz);
743 fjx2 = _mm256_add_ps(fjx2,tx);
744 fjy2 = _mm256_add_ps(fjy2,ty);
745 fjz2 = _mm256_add_ps(fjz2,tz);
747 fjptrA = f+j_coord_offsetA;
748 fjptrB = f+j_coord_offsetB;
749 fjptrC = f+j_coord_offsetC;
750 fjptrD = f+j_coord_offsetD;
751 fjptrE = f+j_coord_offsetE;
752 fjptrF = f+j_coord_offsetF;
753 fjptrG = f+j_coord_offsetG;
754 fjptrH = f+j_coord_offsetH;
756 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
757 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
759 /* Inner loop uses 387 flops */
765 /* Get j neighbor index, and coordinate index */
766 jnrlistA = jjnr[jidx];
767 jnrlistB = jjnr[jidx+1];
768 jnrlistC = jjnr[jidx+2];
769 jnrlistD = jjnr[jidx+3];
770 jnrlistE = jjnr[jidx+4];
771 jnrlistF = jjnr[jidx+5];
772 jnrlistG = jjnr[jidx+6];
773 jnrlistH = jjnr[jidx+7];
774 /* Sign of each element will be negative for non-real atoms.
775 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
776 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
778 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
779 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
781 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
782 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
783 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
784 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
785 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
786 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
787 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
788 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
789 j_coord_offsetA = DIM*jnrA;
790 j_coord_offsetB = DIM*jnrB;
791 j_coord_offsetC = DIM*jnrC;
792 j_coord_offsetD = DIM*jnrD;
793 j_coord_offsetE = DIM*jnrE;
794 j_coord_offsetF = DIM*jnrF;
795 j_coord_offsetG = DIM*jnrG;
796 j_coord_offsetH = DIM*jnrH;
798 /* load j atom coordinates */
799 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
800 x+j_coord_offsetC,x+j_coord_offsetD,
801 x+j_coord_offsetE,x+j_coord_offsetF,
802 x+j_coord_offsetG,x+j_coord_offsetH,
803 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
805 /* Calculate displacement vector */
806 dx00 = _mm256_sub_ps(ix0,jx0);
807 dy00 = _mm256_sub_ps(iy0,jy0);
808 dz00 = _mm256_sub_ps(iz0,jz0);
809 dx01 = _mm256_sub_ps(ix0,jx1);
810 dy01 = _mm256_sub_ps(iy0,jy1);
811 dz01 = _mm256_sub_ps(iz0,jz1);
812 dx02 = _mm256_sub_ps(ix0,jx2);
813 dy02 = _mm256_sub_ps(iy0,jy2);
814 dz02 = _mm256_sub_ps(iz0,jz2);
815 dx10 = _mm256_sub_ps(ix1,jx0);
816 dy10 = _mm256_sub_ps(iy1,jy0);
817 dz10 = _mm256_sub_ps(iz1,jz0);
818 dx11 = _mm256_sub_ps(ix1,jx1);
819 dy11 = _mm256_sub_ps(iy1,jy1);
820 dz11 = _mm256_sub_ps(iz1,jz1);
821 dx12 = _mm256_sub_ps(ix1,jx2);
822 dy12 = _mm256_sub_ps(iy1,jy2);
823 dz12 = _mm256_sub_ps(iz1,jz2);
824 dx20 = _mm256_sub_ps(ix2,jx0);
825 dy20 = _mm256_sub_ps(iy2,jy0);
826 dz20 = _mm256_sub_ps(iz2,jz0);
827 dx21 = _mm256_sub_ps(ix2,jx1);
828 dy21 = _mm256_sub_ps(iy2,jy1);
829 dz21 = _mm256_sub_ps(iz2,jz1);
830 dx22 = _mm256_sub_ps(ix2,jx2);
831 dy22 = _mm256_sub_ps(iy2,jy2);
832 dz22 = _mm256_sub_ps(iz2,jz2);
834 /* Calculate squared distance and things based on it */
835 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
836 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
837 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
838 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
839 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
840 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
841 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
842 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
843 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
845 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
846 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
847 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
848 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
849 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
850 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
851 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
852 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
853 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
855 fjx0 = _mm256_setzero_ps();
856 fjy0 = _mm256_setzero_ps();
857 fjz0 = _mm256_setzero_ps();
858 fjx1 = _mm256_setzero_ps();
859 fjy1 = _mm256_setzero_ps();
860 fjz1 = _mm256_setzero_ps();
861 fjx2 = _mm256_setzero_ps();
862 fjy2 = _mm256_setzero_ps();
863 fjz2 = _mm256_setzero_ps();
865 /**************************
866 * CALCULATE INTERACTIONS *
867 **************************/
869 r00 = _mm256_mul_ps(rsq00,rinv00);
870 r00 = _mm256_andnot_ps(dummy_mask,r00);
872 /* Calculate table index by multiplying r with table scale and truncate to integer */
873 rt = _mm256_mul_ps(r00,vftabscale);
874 vfitab = _mm256_cvttps_epi32(rt);
875 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
876 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
877 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
878 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
879 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
880 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
882 /* CUBIC SPLINE TABLE ELECTROSTATICS */
883 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
884 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
885 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
886 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
887 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
888 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
889 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
890 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
891 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
892 Heps = _mm256_mul_ps(vfeps,H);
893 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
894 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
895 velec = _mm256_mul_ps(qq00,VV);
896 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
897 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
899 /* Update potential sum for this i atom from the interaction with this j atom. */
900 velec = _mm256_andnot_ps(dummy_mask,velec);
901 velecsum = _mm256_add_ps(velecsum,velec);
905 fscal = _mm256_andnot_ps(dummy_mask,fscal);
907 /* Calculate temporary vectorial force */
908 tx = _mm256_mul_ps(fscal,dx00);
909 ty = _mm256_mul_ps(fscal,dy00);
910 tz = _mm256_mul_ps(fscal,dz00);
912 /* Update vectorial force */
913 fix0 = _mm256_add_ps(fix0,tx);
914 fiy0 = _mm256_add_ps(fiy0,ty);
915 fiz0 = _mm256_add_ps(fiz0,tz);
917 fjx0 = _mm256_add_ps(fjx0,tx);
918 fjy0 = _mm256_add_ps(fjy0,ty);
919 fjz0 = _mm256_add_ps(fjz0,tz);
921 /**************************
922 * CALCULATE INTERACTIONS *
923 **************************/
925 r01 = _mm256_mul_ps(rsq01,rinv01);
926 r01 = _mm256_andnot_ps(dummy_mask,r01);
928 /* Calculate table index by multiplying r with table scale and truncate to integer */
929 rt = _mm256_mul_ps(r01,vftabscale);
930 vfitab = _mm256_cvttps_epi32(rt);
931 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
932 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
933 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
934 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
935 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
936 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
938 /* CUBIC SPLINE TABLE ELECTROSTATICS */
939 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
940 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
941 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
942 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
943 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
944 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
945 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
946 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
947 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
948 Heps = _mm256_mul_ps(vfeps,H);
949 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
950 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
951 velec = _mm256_mul_ps(qq01,VV);
952 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
953 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
955 /* Update potential sum for this i atom from the interaction with this j atom. */
956 velec = _mm256_andnot_ps(dummy_mask,velec);
957 velecsum = _mm256_add_ps(velecsum,velec);
961 fscal = _mm256_andnot_ps(dummy_mask,fscal);
963 /* Calculate temporary vectorial force */
964 tx = _mm256_mul_ps(fscal,dx01);
965 ty = _mm256_mul_ps(fscal,dy01);
966 tz = _mm256_mul_ps(fscal,dz01);
968 /* Update vectorial force */
969 fix0 = _mm256_add_ps(fix0,tx);
970 fiy0 = _mm256_add_ps(fiy0,ty);
971 fiz0 = _mm256_add_ps(fiz0,tz);
973 fjx1 = _mm256_add_ps(fjx1,tx);
974 fjy1 = _mm256_add_ps(fjy1,ty);
975 fjz1 = _mm256_add_ps(fjz1,tz);
977 /**************************
978 * CALCULATE INTERACTIONS *
979 **************************/
981 r02 = _mm256_mul_ps(rsq02,rinv02);
982 r02 = _mm256_andnot_ps(dummy_mask,r02);
984 /* Calculate table index by multiplying r with table scale and truncate to integer */
985 rt = _mm256_mul_ps(r02,vftabscale);
986 vfitab = _mm256_cvttps_epi32(rt);
987 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
988 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
989 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
990 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
991 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
992 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
994 /* CUBIC SPLINE TABLE ELECTROSTATICS */
995 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
996 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
997 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
998 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
999 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1000 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1001 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1002 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1003 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1004 Heps = _mm256_mul_ps(vfeps,H);
1005 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1006 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1007 velec = _mm256_mul_ps(qq02,VV);
1008 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1009 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1011 /* Update potential sum for this i atom from the interaction with this j atom. */
1012 velec = _mm256_andnot_ps(dummy_mask,velec);
1013 velecsum = _mm256_add_ps(velecsum,velec);
1017 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1019 /* Calculate temporary vectorial force */
1020 tx = _mm256_mul_ps(fscal,dx02);
1021 ty = _mm256_mul_ps(fscal,dy02);
1022 tz = _mm256_mul_ps(fscal,dz02);
1024 /* Update vectorial force */
1025 fix0 = _mm256_add_ps(fix0,tx);
1026 fiy0 = _mm256_add_ps(fiy0,ty);
1027 fiz0 = _mm256_add_ps(fiz0,tz);
1029 fjx2 = _mm256_add_ps(fjx2,tx);
1030 fjy2 = _mm256_add_ps(fjy2,ty);
1031 fjz2 = _mm256_add_ps(fjz2,tz);
1033 /**************************
1034 * CALCULATE INTERACTIONS *
1035 **************************/
1037 r10 = _mm256_mul_ps(rsq10,rinv10);
1038 r10 = _mm256_andnot_ps(dummy_mask,r10);
1040 /* Calculate table index by multiplying r with table scale and truncate to integer */
1041 rt = _mm256_mul_ps(r10,vftabscale);
1042 vfitab = _mm256_cvttps_epi32(rt);
1043 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1044 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1045 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1046 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1047 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1048 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1050 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1051 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1052 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1053 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1054 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1055 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1056 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1057 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1058 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1059 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1060 Heps = _mm256_mul_ps(vfeps,H);
1061 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1062 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1063 velec = _mm256_mul_ps(qq10,VV);
1064 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1065 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1067 /* Update potential sum for this i atom from the interaction with this j atom. */
1068 velec = _mm256_andnot_ps(dummy_mask,velec);
1069 velecsum = _mm256_add_ps(velecsum,velec);
1073 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1075 /* Calculate temporary vectorial force */
1076 tx = _mm256_mul_ps(fscal,dx10);
1077 ty = _mm256_mul_ps(fscal,dy10);
1078 tz = _mm256_mul_ps(fscal,dz10);
1080 /* Update vectorial force */
1081 fix1 = _mm256_add_ps(fix1,tx);
1082 fiy1 = _mm256_add_ps(fiy1,ty);
1083 fiz1 = _mm256_add_ps(fiz1,tz);
1085 fjx0 = _mm256_add_ps(fjx0,tx);
1086 fjy0 = _mm256_add_ps(fjy0,ty);
1087 fjz0 = _mm256_add_ps(fjz0,tz);
1089 /**************************
1090 * CALCULATE INTERACTIONS *
1091 **************************/
1093 r11 = _mm256_mul_ps(rsq11,rinv11);
1094 r11 = _mm256_andnot_ps(dummy_mask,r11);
1096 /* Calculate table index by multiplying r with table scale and truncate to integer */
1097 rt = _mm256_mul_ps(r11,vftabscale);
1098 vfitab = _mm256_cvttps_epi32(rt);
1099 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1100 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1101 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1102 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1103 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1104 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1106 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1107 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1108 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1109 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1110 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1111 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1112 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1113 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1114 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1115 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1116 Heps = _mm256_mul_ps(vfeps,H);
1117 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1118 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1119 velec = _mm256_mul_ps(qq11,VV);
1120 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1121 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1123 /* Update potential sum for this i atom from the interaction with this j atom. */
1124 velec = _mm256_andnot_ps(dummy_mask,velec);
1125 velecsum = _mm256_add_ps(velecsum,velec);
1129 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1131 /* Calculate temporary vectorial force */
1132 tx = _mm256_mul_ps(fscal,dx11);
1133 ty = _mm256_mul_ps(fscal,dy11);
1134 tz = _mm256_mul_ps(fscal,dz11);
1136 /* Update vectorial force */
1137 fix1 = _mm256_add_ps(fix1,tx);
1138 fiy1 = _mm256_add_ps(fiy1,ty);
1139 fiz1 = _mm256_add_ps(fiz1,tz);
1141 fjx1 = _mm256_add_ps(fjx1,tx);
1142 fjy1 = _mm256_add_ps(fjy1,ty);
1143 fjz1 = _mm256_add_ps(fjz1,tz);
1145 /**************************
1146 * CALCULATE INTERACTIONS *
1147 **************************/
1149 r12 = _mm256_mul_ps(rsq12,rinv12);
1150 r12 = _mm256_andnot_ps(dummy_mask,r12);
1152 /* Calculate table index by multiplying r with table scale and truncate to integer */
1153 rt = _mm256_mul_ps(r12,vftabscale);
1154 vfitab = _mm256_cvttps_epi32(rt);
1155 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1156 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1157 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1158 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1159 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1160 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1162 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1163 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1164 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1165 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1166 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1167 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1168 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1169 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1170 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1171 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1172 Heps = _mm256_mul_ps(vfeps,H);
1173 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1174 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1175 velec = _mm256_mul_ps(qq12,VV);
1176 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1177 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1179 /* Update potential sum for this i atom from the interaction with this j atom. */
1180 velec = _mm256_andnot_ps(dummy_mask,velec);
1181 velecsum = _mm256_add_ps(velecsum,velec);
1185 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1187 /* Calculate temporary vectorial force */
1188 tx = _mm256_mul_ps(fscal,dx12);
1189 ty = _mm256_mul_ps(fscal,dy12);
1190 tz = _mm256_mul_ps(fscal,dz12);
1192 /* Update vectorial force */
1193 fix1 = _mm256_add_ps(fix1,tx);
1194 fiy1 = _mm256_add_ps(fiy1,ty);
1195 fiz1 = _mm256_add_ps(fiz1,tz);
1197 fjx2 = _mm256_add_ps(fjx2,tx);
1198 fjy2 = _mm256_add_ps(fjy2,ty);
1199 fjz2 = _mm256_add_ps(fjz2,tz);
1201 /**************************
1202 * CALCULATE INTERACTIONS *
1203 **************************/
1205 r20 = _mm256_mul_ps(rsq20,rinv20);
1206 r20 = _mm256_andnot_ps(dummy_mask,r20);
1208 /* Calculate table index by multiplying r with table scale and truncate to integer */
1209 rt = _mm256_mul_ps(r20,vftabscale);
1210 vfitab = _mm256_cvttps_epi32(rt);
1211 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1212 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1213 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1214 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1215 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1216 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1218 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1219 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1220 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1221 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1222 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1223 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1224 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1225 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1226 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1227 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1228 Heps = _mm256_mul_ps(vfeps,H);
1229 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1230 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1231 velec = _mm256_mul_ps(qq20,VV);
1232 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1233 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1235 /* Update potential sum for this i atom from the interaction with this j atom. */
1236 velec = _mm256_andnot_ps(dummy_mask,velec);
1237 velecsum = _mm256_add_ps(velecsum,velec);
1241 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1243 /* Calculate temporary vectorial force */
1244 tx = _mm256_mul_ps(fscal,dx20);
1245 ty = _mm256_mul_ps(fscal,dy20);
1246 tz = _mm256_mul_ps(fscal,dz20);
1248 /* Update vectorial force */
1249 fix2 = _mm256_add_ps(fix2,tx);
1250 fiy2 = _mm256_add_ps(fiy2,ty);
1251 fiz2 = _mm256_add_ps(fiz2,tz);
1253 fjx0 = _mm256_add_ps(fjx0,tx);
1254 fjy0 = _mm256_add_ps(fjy0,ty);
1255 fjz0 = _mm256_add_ps(fjz0,tz);
1257 /**************************
1258 * CALCULATE INTERACTIONS *
1259 **************************/
1261 r21 = _mm256_mul_ps(rsq21,rinv21);
1262 r21 = _mm256_andnot_ps(dummy_mask,r21);
1264 /* Calculate table index by multiplying r with table scale and truncate to integer */
1265 rt = _mm256_mul_ps(r21,vftabscale);
1266 vfitab = _mm256_cvttps_epi32(rt);
1267 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1268 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1269 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1270 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1271 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1272 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1274 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1275 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1276 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1277 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1278 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1279 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1280 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1281 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1282 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1283 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1284 Heps = _mm256_mul_ps(vfeps,H);
1285 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1286 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1287 velec = _mm256_mul_ps(qq21,VV);
1288 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1289 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1291 /* Update potential sum for this i atom from the interaction with this j atom. */
1292 velec = _mm256_andnot_ps(dummy_mask,velec);
1293 velecsum = _mm256_add_ps(velecsum,velec);
1297 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1299 /* Calculate temporary vectorial force */
1300 tx = _mm256_mul_ps(fscal,dx21);
1301 ty = _mm256_mul_ps(fscal,dy21);
1302 tz = _mm256_mul_ps(fscal,dz21);
1304 /* Update vectorial force */
1305 fix2 = _mm256_add_ps(fix2,tx);
1306 fiy2 = _mm256_add_ps(fiy2,ty);
1307 fiz2 = _mm256_add_ps(fiz2,tz);
1309 fjx1 = _mm256_add_ps(fjx1,tx);
1310 fjy1 = _mm256_add_ps(fjy1,ty);
1311 fjz1 = _mm256_add_ps(fjz1,tz);
1313 /**************************
1314 * CALCULATE INTERACTIONS *
1315 **************************/
1317 r22 = _mm256_mul_ps(rsq22,rinv22);
1318 r22 = _mm256_andnot_ps(dummy_mask,r22);
1320 /* Calculate table index by multiplying r with table scale and truncate to integer */
1321 rt = _mm256_mul_ps(r22,vftabscale);
1322 vfitab = _mm256_cvttps_epi32(rt);
1323 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1324 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1325 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1326 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1327 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1328 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1330 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1331 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1332 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1333 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1334 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1335 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1336 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1337 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1338 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1339 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1340 Heps = _mm256_mul_ps(vfeps,H);
1341 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1342 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1343 velec = _mm256_mul_ps(qq22,VV);
1344 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1345 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1347 /* Update potential sum for this i atom from the interaction with this j atom. */
1348 velec = _mm256_andnot_ps(dummy_mask,velec);
1349 velecsum = _mm256_add_ps(velecsum,velec);
1353 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1355 /* Calculate temporary vectorial force */
1356 tx = _mm256_mul_ps(fscal,dx22);
1357 ty = _mm256_mul_ps(fscal,dy22);
1358 tz = _mm256_mul_ps(fscal,dz22);
1360 /* Update vectorial force */
1361 fix2 = _mm256_add_ps(fix2,tx);
1362 fiy2 = _mm256_add_ps(fiy2,ty);
1363 fiz2 = _mm256_add_ps(fiz2,tz);
1365 fjx2 = _mm256_add_ps(fjx2,tx);
1366 fjy2 = _mm256_add_ps(fjy2,ty);
1367 fjz2 = _mm256_add_ps(fjz2,tz);
1369 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1370 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1371 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1372 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1373 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1374 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1375 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1376 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1378 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1379 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1381 /* Inner loop uses 396 flops */
1384 /* End of innermost loop */
1386 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1387 f+i_coord_offset,fshift+i_shift_offset);
1390 /* Update potential energies */
1391 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1393 /* Increment number of inner iterations */
1394 inneriter += j_index_end - j_index_start;
1396 /* Outer loop uses 19 flops */
1399 /* Increment number of outer iterations */
1402 /* Update outer/inner flops */
1404 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
1407 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_single
1408 * Electrostatics interaction: CubicSplineTable
1409 * VdW interaction: None
1410 * Geometry: Water3-Water3
1411 * Calculate force/pot: Force
1414 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_single
1415 (t_nblist * gmx_restrict nlist,
1416 rvec * gmx_restrict xx,
1417 rvec * gmx_restrict ff,
1418 t_forcerec * gmx_restrict fr,
1419 t_mdatoms * gmx_restrict mdatoms,
1420 nb_kernel_data_t * gmx_restrict kernel_data,
1421 t_nrnb * gmx_restrict nrnb)
1423 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1424 * just 0 for non-waters.
1425 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1426 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1428 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1429 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1430 int jnrA,jnrB,jnrC,jnrD;
1431 int jnrE,jnrF,jnrG,jnrH;
1432 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1433 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1434 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1435 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1436 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1437 real rcutoff_scalar;
1438 real *shiftvec,*fshift,*x,*f;
1439 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1440 real scratch[4*DIM];
1441 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1442 real * vdwioffsetptr0;
1443 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1444 real * vdwioffsetptr1;
1445 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1446 real * vdwioffsetptr2;
1447 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1448 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1449 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1450 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1451 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1452 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1453 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1454 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1455 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1456 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1457 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1458 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1459 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1460 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1461 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1462 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1463 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1466 __m128i vfitab_lo,vfitab_hi;
1467 __m128i ifour = _mm_set1_epi32(4);
1468 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1470 __m256 dummy_mask,cutoff_mask;
1471 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1472 __m256 one = _mm256_set1_ps(1.0);
1473 __m256 two = _mm256_set1_ps(2.0);
1479 jindex = nlist->jindex;
1481 shiftidx = nlist->shift;
1483 shiftvec = fr->shift_vec[0];
1484 fshift = fr->fshift[0];
1485 facel = _mm256_set1_ps(fr->epsfac);
1486 charge = mdatoms->chargeA;
1488 vftab = kernel_data->table_elec->data;
1489 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
1491 /* Setup water-specific parameters */
1492 inr = nlist->iinr[0];
1493 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1494 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1495 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1497 jq0 = _mm256_set1_ps(charge[inr+0]);
1498 jq1 = _mm256_set1_ps(charge[inr+1]);
1499 jq2 = _mm256_set1_ps(charge[inr+2]);
1500 qq00 = _mm256_mul_ps(iq0,jq0);
1501 qq01 = _mm256_mul_ps(iq0,jq1);
1502 qq02 = _mm256_mul_ps(iq0,jq2);
1503 qq10 = _mm256_mul_ps(iq1,jq0);
1504 qq11 = _mm256_mul_ps(iq1,jq1);
1505 qq12 = _mm256_mul_ps(iq1,jq2);
1506 qq20 = _mm256_mul_ps(iq2,jq0);
1507 qq21 = _mm256_mul_ps(iq2,jq1);
1508 qq22 = _mm256_mul_ps(iq2,jq2);
1510 /* Avoid stupid compiler warnings */
1511 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1512 j_coord_offsetA = 0;
1513 j_coord_offsetB = 0;
1514 j_coord_offsetC = 0;
1515 j_coord_offsetD = 0;
1516 j_coord_offsetE = 0;
1517 j_coord_offsetF = 0;
1518 j_coord_offsetG = 0;
1519 j_coord_offsetH = 0;
1524 for(iidx=0;iidx<4*DIM;iidx++)
1526 scratch[iidx] = 0.0;
1529 /* Start outer loop over neighborlists */
1530 for(iidx=0; iidx<nri; iidx++)
1532 /* Load shift vector for this list */
1533 i_shift_offset = DIM*shiftidx[iidx];
1535 /* Load limits for loop over neighbors */
1536 j_index_start = jindex[iidx];
1537 j_index_end = jindex[iidx+1];
1539 /* Get outer coordinate index */
1541 i_coord_offset = DIM*inr;
1543 /* Load i particle coords and add shift vector */
1544 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1545 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1547 fix0 = _mm256_setzero_ps();
1548 fiy0 = _mm256_setzero_ps();
1549 fiz0 = _mm256_setzero_ps();
1550 fix1 = _mm256_setzero_ps();
1551 fiy1 = _mm256_setzero_ps();
1552 fiz1 = _mm256_setzero_ps();
1553 fix2 = _mm256_setzero_ps();
1554 fiy2 = _mm256_setzero_ps();
1555 fiz2 = _mm256_setzero_ps();
1557 /* Start inner kernel loop */
1558 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1561 /* Get j neighbor index, and coordinate index */
1563 jnrB = jjnr[jidx+1];
1564 jnrC = jjnr[jidx+2];
1565 jnrD = jjnr[jidx+3];
1566 jnrE = jjnr[jidx+4];
1567 jnrF = jjnr[jidx+5];
1568 jnrG = jjnr[jidx+6];
1569 jnrH = jjnr[jidx+7];
1570 j_coord_offsetA = DIM*jnrA;
1571 j_coord_offsetB = DIM*jnrB;
1572 j_coord_offsetC = DIM*jnrC;
1573 j_coord_offsetD = DIM*jnrD;
1574 j_coord_offsetE = DIM*jnrE;
1575 j_coord_offsetF = DIM*jnrF;
1576 j_coord_offsetG = DIM*jnrG;
1577 j_coord_offsetH = DIM*jnrH;
1579 /* load j atom coordinates */
1580 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1581 x+j_coord_offsetC,x+j_coord_offsetD,
1582 x+j_coord_offsetE,x+j_coord_offsetF,
1583 x+j_coord_offsetG,x+j_coord_offsetH,
1584 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1586 /* Calculate displacement vector */
1587 dx00 = _mm256_sub_ps(ix0,jx0);
1588 dy00 = _mm256_sub_ps(iy0,jy0);
1589 dz00 = _mm256_sub_ps(iz0,jz0);
1590 dx01 = _mm256_sub_ps(ix0,jx1);
1591 dy01 = _mm256_sub_ps(iy0,jy1);
1592 dz01 = _mm256_sub_ps(iz0,jz1);
1593 dx02 = _mm256_sub_ps(ix0,jx2);
1594 dy02 = _mm256_sub_ps(iy0,jy2);
1595 dz02 = _mm256_sub_ps(iz0,jz2);
1596 dx10 = _mm256_sub_ps(ix1,jx0);
1597 dy10 = _mm256_sub_ps(iy1,jy0);
1598 dz10 = _mm256_sub_ps(iz1,jz0);
1599 dx11 = _mm256_sub_ps(ix1,jx1);
1600 dy11 = _mm256_sub_ps(iy1,jy1);
1601 dz11 = _mm256_sub_ps(iz1,jz1);
1602 dx12 = _mm256_sub_ps(ix1,jx2);
1603 dy12 = _mm256_sub_ps(iy1,jy2);
1604 dz12 = _mm256_sub_ps(iz1,jz2);
1605 dx20 = _mm256_sub_ps(ix2,jx0);
1606 dy20 = _mm256_sub_ps(iy2,jy0);
1607 dz20 = _mm256_sub_ps(iz2,jz0);
1608 dx21 = _mm256_sub_ps(ix2,jx1);
1609 dy21 = _mm256_sub_ps(iy2,jy1);
1610 dz21 = _mm256_sub_ps(iz2,jz1);
1611 dx22 = _mm256_sub_ps(ix2,jx2);
1612 dy22 = _mm256_sub_ps(iy2,jy2);
1613 dz22 = _mm256_sub_ps(iz2,jz2);
1615 /* Calculate squared distance and things based on it */
1616 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1617 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1618 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1619 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1620 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1621 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1622 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1623 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1624 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1626 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1627 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1628 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1629 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1630 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1631 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1632 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1633 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1634 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1636 fjx0 = _mm256_setzero_ps();
1637 fjy0 = _mm256_setzero_ps();
1638 fjz0 = _mm256_setzero_ps();
1639 fjx1 = _mm256_setzero_ps();
1640 fjy1 = _mm256_setzero_ps();
1641 fjz1 = _mm256_setzero_ps();
1642 fjx2 = _mm256_setzero_ps();
1643 fjy2 = _mm256_setzero_ps();
1644 fjz2 = _mm256_setzero_ps();
1646 /**************************
1647 * CALCULATE INTERACTIONS *
1648 **************************/
1650 r00 = _mm256_mul_ps(rsq00,rinv00);
1652 /* Calculate table index by multiplying r with table scale and truncate to integer */
1653 rt = _mm256_mul_ps(r00,vftabscale);
1654 vfitab = _mm256_cvttps_epi32(rt);
1655 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1656 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1657 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1658 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1659 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1660 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1662 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1663 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1664 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1665 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1666 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1667 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1668 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1669 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1670 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1671 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1672 Heps = _mm256_mul_ps(vfeps,H);
1673 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1674 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1675 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1679 /* Calculate temporary vectorial force */
1680 tx = _mm256_mul_ps(fscal,dx00);
1681 ty = _mm256_mul_ps(fscal,dy00);
1682 tz = _mm256_mul_ps(fscal,dz00);
1684 /* Update vectorial force */
1685 fix0 = _mm256_add_ps(fix0,tx);
1686 fiy0 = _mm256_add_ps(fiy0,ty);
1687 fiz0 = _mm256_add_ps(fiz0,tz);
1689 fjx0 = _mm256_add_ps(fjx0,tx);
1690 fjy0 = _mm256_add_ps(fjy0,ty);
1691 fjz0 = _mm256_add_ps(fjz0,tz);
1693 /**************************
1694 * CALCULATE INTERACTIONS *
1695 **************************/
1697 r01 = _mm256_mul_ps(rsq01,rinv01);
1699 /* Calculate table index by multiplying r with table scale and truncate to integer */
1700 rt = _mm256_mul_ps(r01,vftabscale);
1701 vfitab = _mm256_cvttps_epi32(rt);
1702 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1703 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1704 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1705 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1706 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1707 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1709 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1710 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1711 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1712 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1713 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1714 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1715 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1716 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1717 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1718 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1719 Heps = _mm256_mul_ps(vfeps,H);
1720 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1721 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1722 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1726 /* Calculate temporary vectorial force */
1727 tx = _mm256_mul_ps(fscal,dx01);
1728 ty = _mm256_mul_ps(fscal,dy01);
1729 tz = _mm256_mul_ps(fscal,dz01);
1731 /* Update vectorial force */
1732 fix0 = _mm256_add_ps(fix0,tx);
1733 fiy0 = _mm256_add_ps(fiy0,ty);
1734 fiz0 = _mm256_add_ps(fiz0,tz);
1736 fjx1 = _mm256_add_ps(fjx1,tx);
1737 fjy1 = _mm256_add_ps(fjy1,ty);
1738 fjz1 = _mm256_add_ps(fjz1,tz);
1740 /**************************
1741 * CALCULATE INTERACTIONS *
1742 **************************/
1744 r02 = _mm256_mul_ps(rsq02,rinv02);
1746 /* Calculate table index by multiplying r with table scale and truncate to integer */
1747 rt = _mm256_mul_ps(r02,vftabscale);
1748 vfitab = _mm256_cvttps_epi32(rt);
1749 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1750 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1751 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1752 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1753 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1754 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1756 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1757 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1758 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1759 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1760 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1761 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1762 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1763 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1764 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1765 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1766 Heps = _mm256_mul_ps(vfeps,H);
1767 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1768 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1769 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1773 /* Calculate temporary vectorial force */
1774 tx = _mm256_mul_ps(fscal,dx02);
1775 ty = _mm256_mul_ps(fscal,dy02);
1776 tz = _mm256_mul_ps(fscal,dz02);
1778 /* Update vectorial force */
1779 fix0 = _mm256_add_ps(fix0,tx);
1780 fiy0 = _mm256_add_ps(fiy0,ty);
1781 fiz0 = _mm256_add_ps(fiz0,tz);
1783 fjx2 = _mm256_add_ps(fjx2,tx);
1784 fjy2 = _mm256_add_ps(fjy2,ty);
1785 fjz2 = _mm256_add_ps(fjz2,tz);
1787 /**************************
1788 * CALCULATE INTERACTIONS *
1789 **************************/
1791 r10 = _mm256_mul_ps(rsq10,rinv10);
1793 /* Calculate table index by multiplying r with table scale and truncate to integer */
1794 rt = _mm256_mul_ps(r10,vftabscale);
1795 vfitab = _mm256_cvttps_epi32(rt);
1796 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1797 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1798 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1799 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1800 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1801 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1803 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1804 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1805 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1806 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1807 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1808 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1809 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1810 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1811 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1812 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1813 Heps = _mm256_mul_ps(vfeps,H);
1814 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1815 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1816 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1820 /* Calculate temporary vectorial force */
1821 tx = _mm256_mul_ps(fscal,dx10);
1822 ty = _mm256_mul_ps(fscal,dy10);
1823 tz = _mm256_mul_ps(fscal,dz10);
1825 /* Update vectorial force */
1826 fix1 = _mm256_add_ps(fix1,tx);
1827 fiy1 = _mm256_add_ps(fiy1,ty);
1828 fiz1 = _mm256_add_ps(fiz1,tz);
1830 fjx0 = _mm256_add_ps(fjx0,tx);
1831 fjy0 = _mm256_add_ps(fjy0,ty);
1832 fjz0 = _mm256_add_ps(fjz0,tz);
1834 /**************************
1835 * CALCULATE INTERACTIONS *
1836 **************************/
1838 r11 = _mm256_mul_ps(rsq11,rinv11);
1840 /* Calculate table index by multiplying r with table scale and truncate to integer */
1841 rt = _mm256_mul_ps(r11,vftabscale);
1842 vfitab = _mm256_cvttps_epi32(rt);
1843 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1844 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1845 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1846 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1847 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1848 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1850 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1851 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1852 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1853 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1854 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1855 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1856 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1857 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1858 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1859 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1860 Heps = _mm256_mul_ps(vfeps,H);
1861 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1862 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1863 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1867 /* Calculate temporary vectorial force */
1868 tx = _mm256_mul_ps(fscal,dx11);
1869 ty = _mm256_mul_ps(fscal,dy11);
1870 tz = _mm256_mul_ps(fscal,dz11);
1872 /* Update vectorial force */
1873 fix1 = _mm256_add_ps(fix1,tx);
1874 fiy1 = _mm256_add_ps(fiy1,ty);
1875 fiz1 = _mm256_add_ps(fiz1,tz);
1877 fjx1 = _mm256_add_ps(fjx1,tx);
1878 fjy1 = _mm256_add_ps(fjy1,ty);
1879 fjz1 = _mm256_add_ps(fjz1,tz);
1881 /**************************
1882 * CALCULATE INTERACTIONS *
1883 **************************/
1885 r12 = _mm256_mul_ps(rsq12,rinv12);
1887 /* Calculate table index by multiplying r with table scale and truncate to integer */
1888 rt = _mm256_mul_ps(r12,vftabscale);
1889 vfitab = _mm256_cvttps_epi32(rt);
1890 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1891 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1892 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1893 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1894 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1895 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1897 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1898 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1899 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1900 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1901 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1902 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1903 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1904 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1905 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1906 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1907 Heps = _mm256_mul_ps(vfeps,H);
1908 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1909 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1910 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1914 /* Calculate temporary vectorial force */
1915 tx = _mm256_mul_ps(fscal,dx12);
1916 ty = _mm256_mul_ps(fscal,dy12);
1917 tz = _mm256_mul_ps(fscal,dz12);
1919 /* Update vectorial force */
1920 fix1 = _mm256_add_ps(fix1,tx);
1921 fiy1 = _mm256_add_ps(fiy1,ty);
1922 fiz1 = _mm256_add_ps(fiz1,tz);
1924 fjx2 = _mm256_add_ps(fjx2,tx);
1925 fjy2 = _mm256_add_ps(fjy2,ty);
1926 fjz2 = _mm256_add_ps(fjz2,tz);
1928 /**************************
1929 * CALCULATE INTERACTIONS *
1930 **************************/
1932 r20 = _mm256_mul_ps(rsq20,rinv20);
1934 /* Calculate table index by multiplying r with table scale and truncate to integer */
1935 rt = _mm256_mul_ps(r20,vftabscale);
1936 vfitab = _mm256_cvttps_epi32(rt);
1937 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1938 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1939 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1940 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1941 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1942 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1944 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1945 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1946 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1947 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1948 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1949 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1950 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1951 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1952 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1953 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1954 Heps = _mm256_mul_ps(vfeps,H);
1955 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1956 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1957 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1961 /* Calculate temporary vectorial force */
1962 tx = _mm256_mul_ps(fscal,dx20);
1963 ty = _mm256_mul_ps(fscal,dy20);
1964 tz = _mm256_mul_ps(fscal,dz20);
1966 /* Update vectorial force */
1967 fix2 = _mm256_add_ps(fix2,tx);
1968 fiy2 = _mm256_add_ps(fiy2,ty);
1969 fiz2 = _mm256_add_ps(fiz2,tz);
1971 fjx0 = _mm256_add_ps(fjx0,tx);
1972 fjy0 = _mm256_add_ps(fjy0,ty);
1973 fjz0 = _mm256_add_ps(fjz0,tz);
1975 /**************************
1976 * CALCULATE INTERACTIONS *
1977 **************************/
1979 r21 = _mm256_mul_ps(rsq21,rinv21);
1981 /* Calculate table index by multiplying r with table scale and truncate to integer */
1982 rt = _mm256_mul_ps(r21,vftabscale);
1983 vfitab = _mm256_cvttps_epi32(rt);
1984 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1985 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1986 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1987 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1988 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1989 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1991 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1992 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1993 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1994 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1995 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1996 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1997 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1998 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1999 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2000 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2001 Heps = _mm256_mul_ps(vfeps,H);
2002 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2003 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2004 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2008 /* Calculate temporary vectorial force */
2009 tx = _mm256_mul_ps(fscal,dx21);
2010 ty = _mm256_mul_ps(fscal,dy21);
2011 tz = _mm256_mul_ps(fscal,dz21);
2013 /* Update vectorial force */
2014 fix2 = _mm256_add_ps(fix2,tx);
2015 fiy2 = _mm256_add_ps(fiy2,ty);
2016 fiz2 = _mm256_add_ps(fiz2,tz);
2018 fjx1 = _mm256_add_ps(fjx1,tx);
2019 fjy1 = _mm256_add_ps(fjy1,ty);
2020 fjz1 = _mm256_add_ps(fjz1,tz);
2022 /**************************
2023 * CALCULATE INTERACTIONS *
2024 **************************/
2026 r22 = _mm256_mul_ps(rsq22,rinv22);
2028 /* Calculate table index by multiplying r with table scale and truncate to integer */
2029 rt = _mm256_mul_ps(r22,vftabscale);
2030 vfitab = _mm256_cvttps_epi32(rt);
2031 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2032 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2033 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2034 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2035 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2036 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2038 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2039 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2040 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2041 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2042 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2043 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2044 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2045 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2046 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2047 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2048 Heps = _mm256_mul_ps(vfeps,H);
2049 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2050 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2051 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2055 /* Calculate temporary vectorial force */
2056 tx = _mm256_mul_ps(fscal,dx22);
2057 ty = _mm256_mul_ps(fscal,dy22);
2058 tz = _mm256_mul_ps(fscal,dz22);
2060 /* Update vectorial force */
2061 fix2 = _mm256_add_ps(fix2,tx);
2062 fiy2 = _mm256_add_ps(fiy2,ty);
2063 fiz2 = _mm256_add_ps(fiz2,tz);
2065 fjx2 = _mm256_add_ps(fjx2,tx);
2066 fjy2 = _mm256_add_ps(fjy2,ty);
2067 fjz2 = _mm256_add_ps(fjz2,tz);
2069 fjptrA = f+j_coord_offsetA;
2070 fjptrB = f+j_coord_offsetB;
2071 fjptrC = f+j_coord_offsetC;
2072 fjptrD = f+j_coord_offsetD;
2073 fjptrE = f+j_coord_offsetE;
2074 fjptrF = f+j_coord_offsetF;
2075 fjptrG = f+j_coord_offsetG;
2076 fjptrH = f+j_coord_offsetH;
2078 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2079 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2081 /* Inner loop uses 351 flops */
2084 if(jidx<j_index_end)
2087 /* Get j neighbor index, and coordinate index */
2088 jnrlistA = jjnr[jidx];
2089 jnrlistB = jjnr[jidx+1];
2090 jnrlistC = jjnr[jidx+2];
2091 jnrlistD = jjnr[jidx+3];
2092 jnrlistE = jjnr[jidx+4];
2093 jnrlistF = jjnr[jidx+5];
2094 jnrlistG = jjnr[jidx+6];
2095 jnrlistH = jjnr[jidx+7];
2096 /* Sign of each element will be negative for non-real atoms.
2097 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2098 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2100 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2101 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2103 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2104 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2105 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2106 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2107 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2108 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2109 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2110 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2111 j_coord_offsetA = DIM*jnrA;
2112 j_coord_offsetB = DIM*jnrB;
2113 j_coord_offsetC = DIM*jnrC;
2114 j_coord_offsetD = DIM*jnrD;
2115 j_coord_offsetE = DIM*jnrE;
2116 j_coord_offsetF = DIM*jnrF;
2117 j_coord_offsetG = DIM*jnrG;
2118 j_coord_offsetH = DIM*jnrH;
2120 /* load j atom coordinates */
2121 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2122 x+j_coord_offsetC,x+j_coord_offsetD,
2123 x+j_coord_offsetE,x+j_coord_offsetF,
2124 x+j_coord_offsetG,x+j_coord_offsetH,
2125 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2127 /* Calculate displacement vector */
2128 dx00 = _mm256_sub_ps(ix0,jx0);
2129 dy00 = _mm256_sub_ps(iy0,jy0);
2130 dz00 = _mm256_sub_ps(iz0,jz0);
2131 dx01 = _mm256_sub_ps(ix0,jx1);
2132 dy01 = _mm256_sub_ps(iy0,jy1);
2133 dz01 = _mm256_sub_ps(iz0,jz1);
2134 dx02 = _mm256_sub_ps(ix0,jx2);
2135 dy02 = _mm256_sub_ps(iy0,jy2);
2136 dz02 = _mm256_sub_ps(iz0,jz2);
2137 dx10 = _mm256_sub_ps(ix1,jx0);
2138 dy10 = _mm256_sub_ps(iy1,jy0);
2139 dz10 = _mm256_sub_ps(iz1,jz0);
2140 dx11 = _mm256_sub_ps(ix1,jx1);
2141 dy11 = _mm256_sub_ps(iy1,jy1);
2142 dz11 = _mm256_sub_ps(iz1,jz1);
2143 dx12 = _mm256_sub_ps(ix1,jx2);
2144 dy12 = _mm256_sub_ps(iy1,jy2);
2145 dz12 = _mm256_sub_ps(iz1,jz2);
2146 dx20 = _mm256_sub_ps(ix2,jx0);
2147 dy20 = _mm256_sub_ps(iy2,jy0);
2148 dz20 = _mm256_sub_ps(iz2,jz0);
2149 dx21 = _mm256_sub_ps(ix2,jx1);
2150 dy21 = _mm256_sub_ps(iy2,jy1);
2151 dz21 = _mm256_sub_ps(iz2,jz1);
2152 dx22 = _mm256_sub_ps(ix2,jx2);
2153 dy22 = _mm256_sub_ps(iy2,jy2);
2154 dz22 = _mm256_sub_ps(iz2,jz2);
2156 /* Calculate squared distance and things based on it */
2157 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2158 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2159 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2160 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2161 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2162 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2163 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2164 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2165 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2167 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2168 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
2169 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
2170 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
2171 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2172 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2173 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
2174 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2175 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2177 fjx0 = _mm256_setzero_ps();
2178 fjy0 = _mm256_setzero_ps();
2179 fjz0 = _mm256_setzero_ps();
2180 fjx1 = _mm256_setzero_ps();
2181 fjy1 = _mm256_setzero_ps();
2182 fjz1 = _mm256_setzero_ps();
2183 fjx2 = _mm256_setzero_ps();
2184 fjy2 = _mm256_setzero_ps();
2185 fjz2 = _mm256_setzero_ps();
2187 /**************************
2188 * CALCULATE INTERACTIONS *
2189 **************************/
2191 r00 = _mm256_mul_ps(rsq00,rinv00);
2192 r00 = _mm256_andnot_ps(dummy_mask,r00);
2194 /* Calculate table index by multiplying r with table scale and truncate to integer */
2195 rt = _mm256_mul_ps(r00,vftabscale);
2196 vfitab = _mm256_cvttps_epi32(rt);
2197 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2198 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2199 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2200 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2201 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2202 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2204 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2205 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2206 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2207 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2208 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2209 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2210 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2211 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2212 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2213 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2214 Heps = _mm256_mul_ps(vfeps,H);
2215 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2216 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2217 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2221 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2223 /* Calculate temporary vectorial force */
2224 tx = _mm256_mul_ps(fscal,dx00);
2225 ty = _mm256_mul_ps(fscal,dy00);
2226 tz = _mm256_mul_ps(fscal,dz00);
2228 /* Update vectorial force */
2229 fix0 = _mm256_add_ps(fix0,tx);
2230 fiy0 = _mm256_add_ps(fiy0,ty);
2231 fiz0 = _mm256_add_ps(fiz0,tz);
2233 fjx0 = _mm256_add_ps(fjx0,tx);
2234 fjy0 = _mm256_add_ps(fjy0,ty);
2235 fjz0 = _mm256_add_ps(fjz0,tz);
2237 /**************************
2238 * CALCULATE INTERACTIONS *
2239 **************************/
2241 r01 = _mm256_mul_ps(rsq01,rinv01);
2242 r01 = _mm256_andnot_ps(dummy_mask,r01);
2244 /* Calculate table index by multiplying r with table scale and truncate to integer */
2245 rt = _mm256_mul_ps(r01,vftabscale);
2246 vfitab = _mm256_cvttps_epi32(rt);
2247 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2248 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2249 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2250 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2251 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2252 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2254 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2255 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2256 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2257 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2258 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2259 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2260 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2261 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2262 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2263 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2264 Heps = _mm256_mul_ps(vfeps,H);
2265 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2266 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2267 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2271 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2273 /* Calculate temporary vectorial force */
2274 tx = _mm256_mul_ps(fscal,dx01);
2275 ty = _mm256_mul_ps(fscal,dy01);
2276 tz = _mm256_mul_ps(fscal,dz01);
2278 /* Update vectorial force */
2279 fix0 = _mm256_add_ps(fix0,tx);
2280 fiy0 = _mm256_add_ps(fiy0,ty);
2281 fiz0 = _mm256_add_ps(fiz0,tz);
2283 fjx1 = _mm256_add_ps(fjx1,tx);
2284 fjy1 = _mm256_add_ps(fjy1,ty);
2285 fjz1 = _mm256_add_ps(fjz1,tz);
2287 /**************************
2288 * CALCULATE INTERACTIONS *
2289 **************************/
2291 r02 = _mm256_mul_ps(rsq02,rinv02);
2292 r02 = _mm256_andnot_ps(dummy_mask,r02);
2294 /* Calculate table index by multiplying r with table scale and truncate to integer */
2295 rt = _mm256_mul_ps(r02,vftabscale);
2296 vfitab = _mm256_cvttps_epi32(rt);
2297 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2298 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2299 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2300 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2301 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2302 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2304 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2305 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2306 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2307 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2308 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2309 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2310 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2311 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2312 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2313 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2314 Heps = _mm256_mul_ps(vfeps,H);
2315 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2316 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2317 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2321 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2323 /* Calculate temporary vectorial force */
2324 tx = _mm256_mul_ps(fscal,dx02);
2325 ty = _mm256_mul_ps(fscal,dy02);
2326 tz = _mm256_mul_ps(fscal,dz02);
2328 /* Update vectorial force */
2329 fix0 = _mm256_add_ps(fix0,tx);
2330 fiy0 = _mm256_add_ps(fiy0,ty);
2331 fiz0 = _mm256_add_ps(fiz0,tz);
2333 fjx2 = _mm256_add_ps(fjx2,tx);
2334 fjy2 = _mm256_add_ps(fjy2,ty);
2335 fjz2 = _mm256_add_ps(fjz2,tz);
2337 /**************************
2338 * CALCULATE INTERACTIONS *
2339 **************************/
2341 r10 = _mm256_mul_ps(rsq10,rinv10);
2342 r10 = _mm256_andnot_ps(dummy_mask,r10);
2344 /* Calculate table index by multiplying r with table scale and truncate to integer */
2345 rt = _mm256_mul_ps(r10,vftabscale);
2346 vfitab = _mm256_cvttps_epi32(rt);
2347 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2348 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2349 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2350 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2351 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2352 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2354 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2355 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2356 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2357 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2358 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2359 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2360 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2361 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2362 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2363 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2364 Heps = _mm256_mul_ps(vfeps,H);
2365 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2366 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2367 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2371 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2373 /* Calculate temporary vectorial force */
2374 tx = _mm256_mul_ps(fscal,dx10);
2375 ty = _mm256_mul_ps(fscal,dy10);
2376 tz = _mm256_mul_ps(fscal,dz10);
2378 /* Update vectorial force */
2379 fix1 = _mm256_add_ps(fix1,tx);
2380 fiy1 = _mm256_add_ps(fiy1,ty);
2381 fiz1 = _mm256_add_ps(fiz1,tz);
2383 fjx0 = _mm256_add_ps(fjx0,tx);
2384 fjy0 = _mm256_add_ps(fjy0,ty);
2385 fjz0 = _mm256_add_ps(fjz0,tz);
2387 /**************************
2388 * CALCULATE INTERACTIONS *
2389 **************************/
2391 r11 = _mm256_mul_ps(rsq11,rinv11);
2392 r11 = _mm256_andnot_ps(dummy_mask,r11);
2394 /* Calculate table index by multiplying r with table scale and truncate to integer */
2395 rt = _mm256_mul_ps(r11,vftabscale);
2396 vfitab = _mm256_cvttps_epi32(rt);
2397 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2398 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2399 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2400 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2401 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2402 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2404 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2405 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2406 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2407 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2408 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2409 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2410 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2411 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2412 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2413 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2414 Heps = _mm256_mul_ps(vfeps,H);
2415 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2416 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2417 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2421 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2423 /* Calculate temporary vectorial force */
2424 tx = _mm256_mul_ps(fscal,dx11);
2425 ty = _mm256_mul_ps(fscal,dy11);
2426 tz = _mm256_mul_ps(fscal,dz11);
2428 /* Update vectorial force */
2429 fix1 = _mm256_add_ps(fix1,tx);
2430 fiy1 = _mm256_add_ps(fiy1,ty);
2431 fiz1 = _mm256_add_ps(fiz1,tz);
2433 fjx1 = _mm256_add_ps(fjx1,tx);
2434 fjy1 = _mm256_add_ps(fjy1,ty);
2435 fjz1 = _mm256_add_ps(fjz1,tz);
2437 /**************************
2438 * CALCULATE INTERACTIONS *
2439 **************************/
2441 r12 = _mm256_mul_ps(rsq12,rinv12);
2442 r12 = _mm256_andnot_ps(dummy_mask,r12);
2444 /* Calculate table index by multiplying r with table scale and truncate to integer */
2445 rt = _mm256_mul_ps(r12,vftabscale);
2446 vfitab = _mm256_cvttps_epi32(rt);
2447 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2448 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2449 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2450 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2451 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2452 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2454 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2455 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2456 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2457 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2458 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2459 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2460 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2461 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2462 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2463 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2464 Heps = _mm256_mul_ps(vfeps,H);
2465 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2466 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2467 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2471 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2473 /* Calculate temporary vectorial force */
2474 tx = _mm256_mul_ps(fscal,dx12);
2475 ty = _mm256_mul_ps(fscal,dy12);
2476 tz = _mm256_mul_ps(fscal,dz12);
2478 /* Update vectorial force */
2479 fix1 = _mm256_add_ps(fix1,tx);
2480 fiy1 = _mm256_add_ps(fiy1,ty);
2481 fiz1 = _mm256_add_ps(fiz1,tz);
2483 fjx2 = _mm256_add_ps(fjx2,tx);
2484 fjy2 = _mm256_add_ps(fjy2,ty);
2485 fjz2 = _mm256_add_ps(fjz2,tz);
2487 /**************************
2488 * CALCULATE INTERACTIONS *
2489 **************************/
2491 r20 = _mm256_mul_ps(rsq20,rinv20);
2492 r20 = _mm256_andnot_ps(dummy_mask,r20);
2494 /* Calculate table index by multiplying r with table scale and truncate to integer */
2495 rt = _mm256_mul_ps(r20,vftabscale);
2496 vfitab = _mm256_cvttps_epi32(rt);
2497 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2498 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2499 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2500 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2501 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2502 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2504 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2505 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2506 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2507 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2508 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2509 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2510 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2511 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2512 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2513 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2514 Heps = _mm256_mul_ps(vfeps,H);
2515 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2516 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2517 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2521 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2523 /* Calculate temporary vectorial force */
2524 tx = _mm256_mul_ps(fscal,dx20);
2525 ty = _mm256_mul_ps(fscal,dy20);
2526 tz = _mm256_mul_ps(fscal,dz20);
2528 /* Update vectorial force */
2529 fix2 = _mm256_add_ps(fix2,tx);
2530 fiy2 = _mm256_add_ps(fiy2,ty);
2531 fiz2 = _mm256_add_ps(fiz2,tz);
2533 fjx0 = _mm256_add_ps(fjx0,tx);
2534 fjy0 = _mm256_add_ps(fjy0,ty);
2535 fjz0 = _mm256_add_ps(fjz0,tz);
2537 /**************************
2538 * CALCULATE INTERACTIONS *
2539 **************************/
2541 r21 = _mm256_mul_ps(rsq21,rinv21);
2542 r21 = _mm256_andnot_ps(dummy_mask,r21);
2544 /* Calculate table index by multiplying r with table scale and truncate to integer */
2545 rt = _mm256_mul_ps(r21,vftabscale);
2546 vfitab = _mm256_cvttps_epi32(rt);
2547 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2548 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2549 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2550 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2551 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2552 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2554 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2555 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2556 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2557 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2558 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2559 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2560 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2561 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2562 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2563 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2564 Heps = _mm256_mul_ps(vfeps,H);
2565 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2566 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2567 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2571 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2573 /* Calculate temporary vectorial force */
2574 tx = _mm256_mul_ps(fscal,dx21);
2575 ty = _mm256_mul_ps(fscal,dy21);
2576 tz = _mm256_mul_ps(fscal,dz21);
2578 /* Update vectorial force */
2579 fix2 = _mm256_add_ps(fix2,tx);
2580 fiy2 = _mm256_add_ps(fiy2,ty);
2581 fiz2 = _mm256_add_ps(fiz2,tz);
2583 fjx1 = _mm256_add_ps(fjx1,tx);
2584 fjy1 = _mm256_add_ps(fjy1,ty);
2585 fjz1 = _mm256_add_ps(fjz1,tz);
2587 /**************************
2588 * CALCULATE INTERACTIONS *
2589 **************************/
2591 r22 = _mm256_mul_ps(rsq22,rinv22);
2592 r22 = _mm256_andnot_ps(dummy_mask,r22);
2594 /* Calculate table index by multiplying r with table scale and truncate to integer */
2595 rt = _mm256_mul_ps(r22,vftabscale);
2596 vfitab = _mm256_cvttps_epi32(rt);
2597 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2598 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2599 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2600 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2601 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2602 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2604 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2605 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2606 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2607 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2608 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2609 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2610 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2611 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2612 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2613 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2614 Heps = _mm256_mul_ps(vfeps,H);
2615 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2616 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2617 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2621 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2623 /* Calculate temporary vectorial force */
2624 tx = _mm256_mul_ps(fscal,dx22);
2625 ty = _mm256_mul_ps(fscal,dy22);
2626 tz = _mm256_mul_ps(fscal,dz22);
2628 /* Update vectorial force */
2629 fix2 = _mm256_add_ps(fix2,tx);
2630 fiy2 = _mm256_add_ps(fiy2,ty);
2631 fiz2 = _mm256_add_ps(fiz2,tz);
2633 fjx2 = _mm256_add_ps(fjx2,tx);
2634 fjy2 = _mm256_add_ps(fjy2,ty);
2635 fjz2 = _mm256_add_ps(fjz2,tz);
2637 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2638 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2639 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2640 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2641 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2642 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2643 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2644 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2646 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2647 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2649 /* Inner loop uses 360 flops */
2652 /* End of innermost loop */
2654 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2655 f+i_coord_offset,fshift+i_shift_offset);
2657 /* Increment number of inner iterations */
2658 inneriter += j_index_end - j_index_start;
2660 /* Outer loop uses 18 flops */
2663 /* Increment number of outer iterations */
2666 /* Update outer/inner flops */
2668 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);