2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_single
38 * Electrostatics interaction: CubicSplineTable
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 real * vdwioffsetptr3;
79 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
80 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
81 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
82 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
83 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
84 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
85 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
86 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
87 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
88 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
89 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
90 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
91 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
92 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
93 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
94 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
95 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
96 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
97 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
98 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
101 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
104 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
105 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
107 __m128i vfitab_lo,vfitab_hi;
108 __m128i ifour = _mm_set1_epi32(4);
109 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
111 __m256 dummy_mask,cutoff_mask;
112 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
113 __m256 one = _mm256_set1_ps(1.0);
114 __m256 two = _mm256_set1_ps(2.0);
120 jindex = nlist->jindex;
122 shiftidx = nlist->shift;
124 shiftvec = fr->shift_vec[0];
125 fshift = fr->fshift[0];
126 facel = _mm256_set1_ps(fr->epsfac);
127 charge = mdatoms->chargeA;
128 nvdwtype = fr->ntype;
130 vdwtype = mdatoms->typeA;
132 vftab = kernel_data->table_elec_vdw->data;
133 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
135 /* Setup water-specific parameters */
136 inr = nlist->iinr[0];
137 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
138 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
139 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
140 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
142 jq1 = _mm256_set1_ps(charge[inr+1]);
143 jq2 = _mm256_set1_ps(charge[inr+2]);
144 jq3 = _mm256_set1_ps(charge[inr+3]);
145 vdwjidx0A = 2*vdwtype[inr+0];
146 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
147 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
148 qq11 = _mm256_mul_ps(iq1,jq1);
149 qq12 = _mm256_mul_ps(iq1,jq2);
150 qq13 = _mm256_mul_ps(iq1,jq3);
151 qq21 = _mm256_mul_ps(iq2,jq1);
152 qq22 = _mm256_mul_ps(iq2,jq2);
153 qq23 = _mm256_mul_ps(iq2,jq3);
154 qq31 = _mm256_mul_ps(iq3,jq1);
155 qq32 = _mm256_mul_ps(iq3,jq2);
156 qq33 = _mm256_mul_ps(iq3,jq3);
158 /* Avoid stupid compiler warnings */
159 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
172 for(iidx=0;iidx<4*DIM;iidx++)
177 /* Start outer loop over neighborlists */
178 for(iidx=0; iidx<nri; iidx++)
180 /* Load shift vector for this list */
181 i_shift_offset = DIM*shiftidx[iidx];
183 /* Load limits for loop over neighbors */
184 j_index_start = jindex[iidx];
185 j_index_end = jindex[iidx+1];
187 /* Get outer coordinate index */
189 i_coord_offset = DIM*inr;
191 /* Load i particle coords and add shift vector */
192 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
193 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
195 fix0 = _mm256_setzero_ps();
196 fiy0 = _mm256_setzero_ps();
197 fiz0 = _mm256_setzero_ps();
198 fix1 = _mm256_setzero_ps();
199 fiy1 = _mm256_setzero_ps();
200 fiz1 = _mm256_setzero_ps();
201 fix2 = _mm256_setzero_ps();
202 fiy2 = _mm256_setzero_ps();
203 fiz2 = _mm256_setzero_ps();
204 fix3 = _mm256_setzero_ps();
205 fiy3 = _mm256_setzero_ps();
206 fiz3 = _mm256_setzero_ps();
208 /* Reset potential sums */
209 velecsum = _mm256_setzero_ps();
210 vvdwsum = _mm256_setzero_ps();
212 /* Start inner kernel loop */
213 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
216 /* Get j neighbor index, and coordinate index */
225 j_coord_offsetA = DIM*jnrA;
226 j_coord_offsetB = DIM*jnrB;
227 j_coord_offsetC = DIM*jnrC;
228 j_coord_offsetD = DIM*jnrD;
229 j_coord_offsetE = DIM*jnrE;
230 j_coord_offsetF = DIM*jnrF;
231 j_coord_offsetG = DIM*jnrG;
232 j_coord_offsetH = DIM*jnrH;
234 /* load j atom coordinates */
235 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236 x+j_coord_offsetC,x+j_coord_offsetD,
237 x+j_coord_offsetE,x+j_coord_offsetF,
238 x+j_coord_offsetG,x+j_coord_offsetH,
239 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
240 &jy2,&jz2,&jx3,&jy3,&jz3);
242 /* Calculate displacement vector */
243 dx00 = _mm256_sub_ps(ix0,jx0);
244 dy00 = _mm256_sub_ps(iy0,jy0);
245 dz00 = _mm256_sub_ps(iz0,jz0);
246 dx11 = _mm256_sub_ps(ix1,jx1);
247 dy11 = _mm256_sub_ps(iy1,jy1);
248 dz11 = _mm256_sub_ps(iz1,jz1);
249 dx12 = _mm256_sub_ps(ix1,jx2);
250 dy12 = _mm256_sub_ps(iy1,jy2);
251 dz12 = _mm256_sub_ps(iz1,jz2);
252 dx13 = _mm256_sub_ps(ix1,jx3);
253 dy13 = _mm256_sub_ps(iy1,jy3);
254 dz13 = _mm256_sub_ps(iz1,jz3);
255 dx21 = _mm256_sub_ps(ix2,jx1);
256 dy21 = _mm256_sub_ps(iy2,jy1);
257 dz21 = _mm256_sub_ps(iz2,jz1);
258 dx22 = _mm256_sub_ps(ix2,jx2);
259 dy22 = _mm256_sub_ps(iy2,jy2);
260 dz22 = _mm256_sub_ps(iz2,jz2);
261 dx23 = _mm256_sub_ps(ix2,jx3);
262 dy23 = _mm256_sub_ps(iy2,jy3);
263 dz23 = _mm256_sub_ps(iz2,jz3);
264 dx31 = _mm256_sub_ps(ix3,jx1);
265 dy31 = _mm256_sub_ps(iy3,jy1);
266 dz31 = _mm256_sub_ps(iz3,jz1);
267 dx32 = _mm256_sub_ps(ix3,jx2);
268 dy32 = _mm256_sub_ps(iy3,jy2);
269 dz32 = _mm256_sub_ps(iz3,jz2);
270 dx33 = _mm256_sub_ps(ix3,jx3);
271 dy33 = _mm256_sub_ps(iy3,jy3);
272 dz33 = _mm256_sub_ps(iz3,jz3);
274 /* Calculate squared distance and things based on it */
275 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
276 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
277 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
278 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
279 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
280 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
281 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
282 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
283 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
284 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
286 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
287 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
288 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
289 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
290 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
291 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
292 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
293 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
294 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
295 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
297 fjx0 = _mm256_setzero_ps();
298 fjy0 = _mm256_setzero_ps();
299 fjz0 = _mm256_setzero_ps();
300 fjx1 = _mm256_setzero_ps();
301 fjy1 = _mm256_setzero_ps();
302 fjz1 = _mm256_setzero_ps();
303 fjx2 = _mm256_setzero_ps();
304 fjy2 = _mm256_setzero_ps();
305 fjz2 = _mm256_setzero_ps();
306 fjx3 = _mm256_setzero_ps();
307 fjy3 = _mm256_setzero_ps();
308 fjz3 = _mm256_setzero_ps();
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
314 r00 = _mm256_mul_ps(rsq00,rinv00);
316 /* Calculate table index by multiplying r with table scale and truncate to integer */
317 rt = _mm256_mul_ps(r00,vftabscale);
318 vfitab = _mm256_cvttps_epi32(rt);
319 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
320 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
321 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
322 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
323 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
324 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
326 /* CUBIC SPLINE TABLE DISPERSION */
327 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
328 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
329 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
330 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
331 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
332 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
333 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
334 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
335 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
336 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
337 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
338 Heps = _mm256_mul_ps(vfeps,H);
339 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
340 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
341 vvdw6 = _mm256_mul_ps(c6_00,VV);
342 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
343 fvdw6 = _mm256_mul_ps(c6_00,FF);
345 /* CUBIC SPLINE TABLE REPULSION */
346 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
347 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
348 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
349 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
350 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
351 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
352 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
353 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
354 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
355 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
356 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
357 Heps = _mm256_mul_ps(vfeps,H);
358 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
359 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
360 vvdw12 = _mm256_mul_ps(c12_00,VV);
361 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
362 fvdw12 = _mm256_mul_ps(c12_00,FF);
363 vvdw = _mm256_add_ps(vvdw12,vvdw6);
364 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
366 /* Update potential sum for this i atom from the interaction with this j atom. */
367 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
371 /* Calculate temporary vectorial force */
372 tx = _mm256_mul_ps(fscal,dx00);
373 ty = _mm256_mul_ps(fscal,dy00);
374 tz = _mm256_mul_ps(fscal,dz00);
376 /* Update vectorial force */
377 fix0 = _mm256_add_ps(fix0,tx);
378 fiy0 = _mm256_add_ps(fiy0,ty);
379 fiz0 = _mm256_add_ps(fiz0,tz);
381 fjx0 = _mm256_add_ps(fjx0,tx);
382 fjy0 = _mm256_add_ps(fjy0,ty);
383 fjz0 = _mm256_add_ps(fjz0,tz);
385 /**************************
386 * CALCULATE INTERACTIONS *
387 **************************/
389 r11 = _mm256_mul_ps(rsq11,rinv11);
391 /* Calculate table index by multiplying r with table scale and truncate to integer */
392 rt = _mm256_mul_ps(r11,vftabscale);
393 vfitab = _mm256_cvttps_epi32(rt);
394 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
395 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
396 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
397 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
398 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
399 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
401 /* CUBIC SPLINE TABLE ELECTROSTATICS */
402 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
403 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
404 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
405 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
406 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
407 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
408 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
409 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
410 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
411 Heps = _mm256_mul_ps(vfeps,H);
412 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
413 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
414 velec = _mm256_mul_ps(qq11,VV);
415 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
416 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
418 /* Update potential sum for this i atom from the interaction with this j atom. */
419 velecsum = _mm256_add_ps(velecsum,velec);
423 /* Calculate temporary vectorial force */
424 tx = _mm256_mul_ps(fscal,dx11);
425 ty = _mm256_mul_ps(fscal,dy11);
426 tz = _mm256_mul_ps(fscal,dz11);
428 /* Update vectorial force */
429 fix1 = _mm256_add_ps(fix1,tx);
430 fiy1 = _mm256_add_ps(fiy1,ty);
431 fiz1 = _mm256_add_ps(fiz1,tz);
433 fjx1 = _mm256_add_ps(fjx1,tx);
434 fjy1 = _mm256_add_ps(fjy1,ty);
435 fjz1 = _mm256_add_ps(fjz1,tz);
437 /**************************
438 * CALCULATE INTERACTIONS *
439 **************************/
441 r12 = _mm256_mul_ps(rsq12,rinv12);
443 /* Calculate table index by multiplying r with table scale and truncate to integer */
444 rt = _mm256_mul_ps(r12,vftabscale);
445 vfitab = _mm256_cvttps_epi32(rt);
446 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
447 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
448 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
449 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
450 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
451 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
453 /* CUBIC SPLINE TABLE ELECTROSTATICS */
454 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
455 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
456 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
457 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
458 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
459 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
460 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
461 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
462 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
463 Heps = _mm256_mul_ps(vfeps,H);
464 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
465 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
466 velec = _mm256_mul_ps(qq12,VV);
467 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
468 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
470 /* Update potential sum for this i atom from the interaction with this j atom. */
471 velecsum = _mm256_add_ps(velecsum,velec);
475 /* Calculate temporary vectorial force */
476 tx = _mm256_mul_ps(fscal,dx12);
477 ty = _mm256_mul_ps(fscal,dy12);
478 tz = _mm256_mul_ps(fscal,dz12);
480 /* Update vectorial force */
481 fix1 = _mm256_add_ps(fix1,tx);
482 fiy1 = _mm256_add_ps(fiy1,ty);
483 fiz1 = _mm256_add_ps(fiz1,tz);
485 fjx2 = _mm256_add_ps(fjx2,tx);
486 fjy2 = _mm256_add_ps(fjy2,ty);
487 fjz2 = _mm256_add_ps(fjz2,tz);
489 /**************************
490 * CALCULATE INTERACTIONS *
491 **************************/
493 r13 = _mm256_mul_ps(rsq13,rinv13);
495 /* Calculate table index by multiplying r with table scale and truncate to integer */
496 rt = _mm256_mul_ps(r13,vftabscale);
497 vfitab = _mm256_cvttps_epi32(rt);
498 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
499 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
500 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
501 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
502 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
503 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
505 /* CUBIC SPLINE TABLE ELECTROSTATICS */
506 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
507 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
508 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
509 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
510 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
511 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
512 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
513 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
514 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
515 Heps = _mm256_mul_ps(vfeps,H);
516 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
517 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
518 velec = _mm256_mul_ps(qq13,VV);
519 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
520 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
522 /* Update potential sum for this i atom from the interaction with this j atom. */
523 velecsum = _mm256_add_ps(velecsum,velec);
527 /* Calculate temporary vectorial force */
528 tx = _mm256_mul_ps(fscal,dx13);
529 ty = _mm256_mul_ps(fscal,dy13);
530 tz = _mm256_mul_ps(fscal,dz13);
532 /* Update vectorial force */
533 fix1 = _mm256_add_ps(fix1,tx);
534 fiy1 = _mm256_add_ps(fiy1,ty);
535 fiz1 = _mm256_add_ps(fiz1,tz);
537 fjx3 = _mm256_add_ps(fjx3,tx);
538 fjy3 = _mm256_add_ps(fjy3,ty);
539 fjz3 = _mm256_add_ps(fjz3,tz);
541 /**************************
542 * CALCULATE INTERACTIONS *
543 **************************/
545 r21 = _mm256_mul_ps(rsq21,rinv21);
547 /* Calculate table index by multiplying r with table scale and truncate to integer */
548 rt = _mm256_mul_ps(r21,vftabscale);
549 vfitab = _mm256_cvttps_epi32(rt);
550 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
551 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
552 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
553 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
554 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
555 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
557 /* CUBIC SPLINE TABLE ELECTROSTATICS */
558 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
559 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
560 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
561 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
562 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
563 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
564 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
565 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
566 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
567 Heps = _mm256_mul_ps(vfeps,H);
568 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
569 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
570 velec = _mm256_mul_ps(qq21,VV);
571 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
572 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
574 /* Update potential sum for this i atom from the interaction with this j atom. */
575 velecsum = _mm256_add_ps(velecsum,velec);
579 /* Calculate temporary vectorial force */
580 tx = _mm256_mul_ps(fscal,dx21);
581 ty = _mm256_mul_ps(fscal,dy21);
582 tz = _mm256_mul_ps(fscal,dz21);
584 /* Update vectorial force */
585 fix2 = _mm256_add_ps(fix2,tx);
586 fiy2 = _mm256_add_ps(fiy2,ty);
587 fiz2 = _mm256_add_ps(fiz2,tz);
589 fjx1 = _mm256_add_ps(fjx1,tx);
590 fjy1 = _mm256_add_ps(fjy1,ty);
591 fjz1 = _mm256_add_ps(fjz1,tz);
593 /**************************
594 * CALCULATE INTERACTIONS *
595 **************************/
597 r22 = _mm256_mul_ps(rsq22,rinv22);
599 /* Calculate table index by multiplying r with table scale and truncate to integer */
600 rt = _mm256_mul_ps(r22,vftabscale);
601 vfitab = _mm256_cvttps_epi32(rt);
602 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
603 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
604 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
605 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
606 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
607 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
609 /* CUBIC SPLINE TABLE ELECTROSTATICS */
610 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
611 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
612 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
613 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
614 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
615 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
616 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
617 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
618 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
619 Heps = _mm256_mul_ps(vfeps,H);
620 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
621 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
622 velec = _mm256_mul_ps(qq22,VV);
623 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
624 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 velecsum = _mm256_add_ps(velecsum,velec);
631 /* Calculate temporary vectorial force */
632 tx = _mm256_mul_ps(fscal,dx22);
633 ty = _mm256_mul_ps(fscal,dy22);
634 tz = _mm256_mul_ps(fscal,dz22);
636 /* Update vectorial force */
637 fix2 = _mm256_add_ps(fix2,tx);
638 fiy2 = _mm256_add_ps(fiy2,ty);
639 fiz2 = _mm256_add_ps(fiz2,tz);
641 fjx2 = _mm256_add_ps(fjx2,tx);
642 fjy2 = _mm256_add_ps(fjy2,ty);
643 fjz2 = _mm256_add_ps(fjz2,tz);
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 r23 = _mm256_mul_ps(rsq23,rinv23);
651 /* Calculate table index by multiplying r with table scale and truncate to integer */
652 rt = _mm256_mul_ps(r23,vftabscale);
653 vfitab = _mm256_cvttps_epi32(rt);
654 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
655 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
656 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
657 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
658 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
659 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
661 /* CUBIC SPLINE TABLE ELECTROSTATICS */
662 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
663 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
664 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
665 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
666 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
667 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
668 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
669 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
670 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
671 Heps = _mm256_mul_ps(vfeps,H);
672 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
673 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
674 velec = _mm256_mul_ps(qq23,VV);
675 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
676 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
678 /* Update potential sum for this i atom from the interaction with this j atom. */
679 velecsum = _mm256_add_ps(velecsum,velec);
683 /* Calculate temporary vectorial force */
684 tx = _mm256_mul_ps(fscal,dx23);
685 ty = _mm256_mul_ps(fscal,dy23);
686 tz = _mm256_mul_ps(fscal,dz23);
688 /* Update vectorial force */
689 fix2 = _mm256_add_ps(fix2,tx);
690 fiy2 = _mm256_add_ps(fiy2,ty);
691 fiz2 = _mm256_add_ps(fiz2,tz);
693 fjx3 = _mm256_add_ps(fjx3,tx);
694 fjy3 = _mm256_add_ps(fjy3,ty);
695 fjz3 = _mm256_add_ps(fjz3,tz);
697 /**************************
698 * CALCULATE INTERACTIONS *
699 **************************/
701 r31 = _mm256_mul_ps(rsq31,rinv31);
703 /* Calculate table index by multiplying r with table scale and truncate to integer */
704 rt = _mm256_mul_ps(r31,vftabscale);
705 vfitab = _mm256_cvttps_epi32(rt);
706 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
707 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
708 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
709 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
710 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
711 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
713 /* CUBIC SPLINE TABLE ELECTROSTATICS */
714 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
715 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
716 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
717 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
718 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
719 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
720 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
721 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
722 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
723 Heps = _mm256_mul_ps(vfeps,H);
724 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
725 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
726 velec = _mm256_mul_ps(qq31,VV);
727 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
728 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
730 /* Update potential sum for this i atom from the interaction with this j atom. */
731 velecsum = _mm256_add_ps(velecsum,velec);
735 /* Calculate temporary vectorial force */
736 tx = _mm256_mul_ps(fscal,dx31);
737 ty = _mm256_mul_ps(fscal,dy31);
738 tz = _mm256_mul_ps(fscal,dz31);
740 /* Update vectorial force */
741 fix3 = _mm256_add_ps(fix3,tx);
742 fiy3 = _mm256_add_ps(fiy3,ty);
743 fiz3 = _mm256_add_ps(fiz3,tz);
745 fjx1 = _mm256_add_ps(fjx1,tx);
746 fjy1 = _mm256_add_ps(fjy1,ty);
747 fjz1 = _mm256_add_ps(fjz1,tz);
749 /**************************
750 * CALCULATE INTERACTIONS *
751 **************************/
753 r32 = _mm256_mul_ps(rsq32,rinv32);
755 /* Calculate table index by multiplying r with table scale and truncate to integer */
756 rt = _mm256_mul_ps(r32,vftabscale);
757 vfitab = _mm256_cvttps_epi32(rt);
758 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
759 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
760 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
761 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
762 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
763 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
765 /* CUBIC SPLINE TABLE ELECTROSTATICS */
766 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
767 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
768 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
769 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
770 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
771 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
772 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
773 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
774 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
775 Heps = _mm256_mul_ps(vfeps,H);
776 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
777 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
778 velec = _mm256_mul_ps(qq32,VV);
779 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
780 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
782 /* Update potential sum for this i atom from the interaction with this j atom. */
783 velecsum = _mm256_add_ps(velecsum,velec);
787 /* Calculate temporary vectorial force */
788 tx = _mm256_mul_ps(fscal,dx32);
789 ty = _mm256_mul_ps(fscal,dy32);
790 tz = _mm256_mul_ps(fscal,dz32);
792 /* Update vectorial force */
793 fix3 = _mm256_add_ps(fix3,tx);
794 fiy3 = _mm256_add_ps(fiy3,ty);
795 fiz3 = _mm256_add_ps(fiz3,tz);
797 fjx2 = _mm256_add_ps(fjx2,tx);
798 fjy2 = _mm256_add_ps(fjy2,ty);
799 fjz2 = _mm256_add_ps(fjz2,tz);
801 /**************************
802 * CALCULATE INTERACTIONS *
803 **************************/
805 r33 = _mm256_mul_ps(rsq33,rinv33);
807 /* Calculate table index by multiplying r with table scale and truncate to integer */
808 rt = _mm256_mul_ps(r33,vftabscale);
809 vfitab = _mm256_cvttps_epi32(rt);
810 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
811 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
812 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
813 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
814 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
815 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
817 /* CUBIC SPLINE TABLE ELECTROSTATICS */
818 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
819 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
820 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
821 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
822 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
823 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
824 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
825 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
826 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
827 Heps = _mm256_mul_ps(vfeps,H);
828 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
829 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
830 velec = _mm256_mul_ps(qq33,VV);
831 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
832 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
834 /* Update potential sum for this i atom from the interaction with this j atom. */
835 velecsum = _mm256_add_ps(velecsum,velec);
839 /* Calculate temporary vectorial force */
840 tx = _mm256_mul_ps(fscal,dx33);
841 ty = _mm256_mul_ps(fscal,dy33);
842 tz = _mm256_mul_ps(fscal,dz33);
844 /* Update vectorial force */
845 fix3 = _mm256_add_ps(fix3,tx);
846 fiy3 = _mm256_add_ps(fiy3,ty);
847 fiz3 = _mm256_add_ps(fiz3,tz);
849 fjx3 = _mm256_add_ps(fjx3,tx);
850 fjy3 = _mm256_add_ps(fjy3,ty);
851 fjz3 = _mm256_add_ps(fjz3,tz);
853 fjptrA = f+j_coord_offsetA;
854 fjptrB = f+j_coord_offsetB;
855 fjptrC = f+j_coord_offsetC;
856 fjptrD = f+j_coord_offsetD;
857 fjptrE = f+j_coord_offsetE;
858 fjptrF = f+j_coord_offsetF;
859 fjptrG = f+j_coord_offsetG;
860 fjptrH = f+j_coord_offsetH;
862 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
863 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
864 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
866 /* Inner loop uses 446 flops */
872 /* Get j neighbor index, and coordinate index */
873 jnrlistA = jjnr[jidx];
874 jnrlistB = jjnr[jidx+1];
875 jnrlistC = jjnr[jidx+2];
876 jnrlistD = jjnr[jidx+3];
877 jnrlistE = jjnr[jidx+4];
878 jnrlistF = jjnr[jidx+5];
879 jnrlistG = jjnr[jidx+6];
880 jnrlistH = jjnr[jidx+7];
881 /* Sign of each element will be negative for non-real atoms.
882 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
883 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
885 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
886 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
888 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
889 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
890 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
891 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
892 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
893 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
894 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
895 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
896 j_coord_offsetA = DIM*jnrA;
897 j_coord_offsetB = DIM*jnrB;
898 j_coord_offsetC = DIM*jnrC;
899 j_coord_offsetD = DIM*jnrD;
900 j_coord_offsetE = DIM*jnrE;
901 j_coord_offsetF = DIM*jnrF;
902 j_coord_offsetG = DIM*jnrG;
903 j_coord_offsetH = DIM*jnrH;
905 /* load j atom coordinates */
906 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
907 x+j_coord_offsetC,x+j_coord_offsetD,
908 x+j_coord_offsetE,x+j_coord_offsetF,
909 x+j_coord_offsetG,x+j_coord_offsetH,
910 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
911 &jy2,&jz2,&jx3,&jy3,&jz3);
913 /* Calculate displacement vector */
914 dx00 = _mm256_sub_ps(ix0,jx0);
915 dy00 = _mm256_sub_ps(iy0,jy0);
916 dz00 = _mm256_sub_ps(iz0,jz0);
917 dx11 = _mm256_sub_ps(ix1,jx1);
918 dy11 = _mm256_sub_ps(iy1,jy1);
919 dz11 = _mm256_sub_ps(iz1,jz1);
920 dx12 = _mm256_sub_ps(ix1,jx2);
921 dy12 = _mm256_sub_ps(iy1,jy2);
922 dz12 = _mm256_sub_ps(iz1,jz2);
923 dx13 = _mm256_sub_ps(ix1,jx3);
924 dy13 = _mm256_sub_ps(iy1,jy3);
925 dz13 = _mm256_sub_ps(iz1,jz3);
926 dx21 = _mm256_sub_ps(ix2,jx1);
927 dy21 = _mm256_sub_ps(iy2,jy1);
928 dz21 = _mm256_sub_ps(iz2,jz1);
929 dx22 = _mm256_sub_ps(ix2,jx2);
930 dy22 = _mm256_sub_ps(iy2,jy2);
931 dz22 = _mm256_sub_ps(iz2,jz2);
932 dx23 = _mm256_sub_ps(ix2,jx3);
933 dy23 = _mm256_sub_ps(iy2,jy3);
934 dz23 = _mm256_sub_ps(iz2,jz3);
935 dx31 = _mm256_sub_ps(ix3,jx1);
936 dy31 = _mm256_sub_ps(iy3,jy1);
937 dz31 = _mm256_sub_ps(iz3,jz1);
938 dx32 = _mm256_sub_ps(ix3,jx2);
939 dy32 = _mm256_sub_ps(iy3,jy2);
940 dz32 = _mm256_sub_ps(iz3,jz2);
941 dx33 = _mm256_sub_ps(ix3,jx3);
942 dy33 = _mm256_sub_ps(iy3,jy3);
943 dz33 = _mm256_sub_ps(iz3,jz3);
945 /* Calculate squared distance and things based on it */
946 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
947 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
948 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
949 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
950 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
951 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
952 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
953 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
954 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
955 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
957 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
958 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
959 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
960 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
961 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
962 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
963 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
964 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
965 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
966 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
968 fjx0 = _mm256_setzero_ps();
969 fjy0 = _mm256_setzero_ps();
970 fjz0 = _mm256_setzero_ps();
971 fjx1 = _mm256_setzero_ps();
972 fjy1 = _mm256_setzero_ps();
973 fjz1 = _mm256_setzero_ps();
974 fjx2 = _mm256_setzero_ps();
975 fjy2 = _mm256_setzero_ps();
976 fjz2 = _mm256_setzero_ps();
977 fjx3 = _mm256_setzero_ps();
978 fjy3 = _mm256_setzero_ps();
979 fjz3 = _mm256_setzero_ps();
981 /**************************
982 * CALCULATE INTERACTIONS *
983 **************************/
985 r00 = _mm256_mul_ps(rsq00,rinv00);
986 r00 = _mm256_andnot_ps(dummy_mask,r00);
988 /* Calculate table index by multiplying r with table scale and truncate to integer */
989 rt = _mm256_mul_ps(r00,vftabscale);
990 vfitab = _mm256_cvttps_epi32(rt);
991 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
992 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
993 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
994 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
995 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
996 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
998 /* CUBIC SPLINE TABLE DISPERSION */
999 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1000 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1001 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1002 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1003 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1004 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1005 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1006 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1007 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1008 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1009 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1010 Heps = _mm256_mul_ps(vfeps,H);
1011 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1012 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1013 vvdw6 = _mm256_mul_ps(c6_00,VV);
1014 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1015 fvdw6 = _mm256_mul_ps(c6_00,FF);
1017 /* CUBIC SPLINE TABLE REPULSION */
1018 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1019 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1020 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1021 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1022 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1023 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1024 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1025 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1026 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1027 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1028 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1029 Heps = _mm256_mul_ps(vfeps,H);
1030 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1031 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1032 vvdw12 = _mm256_mul_ps(c12_00,VV);
1033 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1034 fvdw12 = _mm256_mul_ps(c12_00,FF);
1035 vvdw = _mm256_add_ps(vvdw12,vvdw6);
1036 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1038 /* Update potential sum for this i atom from the interaction with this j atom. */
1039 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
1040 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
1044 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1046 /* Calculate temporary vectorial force */
1047 tx = _mm256_mul_ps(fscal,dx00);
1048 ty = _mm256_mul_ps(fscal,dy00);
1049 tz = _mm256_mul_ps(fscal,dz00);
1051 /* Update vectorial force */
1052 fix0 = _mm256_add_ps(fix0,tx);
1053 fiy0 = _mm256_add_ps(fiy0,ty);
1054 fiz0 = _mm256_add_ps(fiz0,tz);
1056 fjx0 = _mm256_add_ps(fjx0,tx);
1057 fjy0 = _mm256_add_ps(fjy0,ty);
1058 fjz0 = _mm256_add_ps(fjz0,tz);
1060 /**************************
1061 * CALCULATE INTERACTIONS *
1062 **************************/
1064 r11 = _mm256_mul_ps(rsq11,rinv11);
1065 r11 = _mm256_andnot_ps(dummy_mask,r11);
1067 /* Calculate table index by multiplying r with table scale and truncate to integer */
1068 rt = _mm256_mul_ps(r11,vftabscale);
1069 vfitab = _mm256_cvttps_epi32(rt);
1070 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1071 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1072 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1073 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1074 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1075 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1077 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1078 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1079 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1080 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1081 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1082 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1083 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1084 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1085 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1086 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1087 Heps = _mm256_mul_ps(vfeps,H);
1088 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1089 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1090 velec = _mm256_mul_ps(qq11,VV);
1091 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1092 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1094 /* Update potential sum for this i atom from the interaction with this j atom. */
1095 velec = _mm256_andnot_ps(dummy_mask,velec);
1096 velecsum = _mm256_add_ps(velecsum,velec);
1100 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1102 /* Calculate temporary vectorial force */
1103 tx = _mm256_mul_ps(fscal,dx11);
1104 ty = _mm256_mul_ps(fscal,dy11);
1105 tz = _mm256_mul_ps(fscal,dz11);
1107 /* Update vectorial force */
1108 fix1 = _mm256_add_ps(fix1,tx);
1109 fiy1 = _mm256_add_ps(fiy1,ty);
1110 fiz1 = _mm256_add_ps(fiz1,tz);
1112 fjx1 = _mm256_add_ps(fjx1,tx);
1113 fjy1 = _mm256_add_ps(fjy1,ty);
1114 fjz1 = _mm256_add_ps(fjz1,tz);
1116 /**************************
1117 * CALCULATE INTERACTIONS *
1118 **************************/
1120 r12 = _mm256_mul_ps(rsq12,rinv12);
1121 r12 = _mm256_andnot_ps(dummy_mask,r12);
1123 /* Calculate table index by multiplying r with table scale and truncate to integer */
1124 rt = _mm256_mul_ps(r12,vftabscale);
1125 vfitab = _mm256_cvttps_epi32(rt);
1126 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1127 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1128 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1129 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1130 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1131 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1133 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1134 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1135 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1136 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1137 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1138 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1139 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1140 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1141 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1142 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1143 Heps = _mm256_mul_ps(vfeps,H);
1144 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1145 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1146 velec = _mm256_mul_ps(qq12,VV);
1147 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1148 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1150 /* Update potential sum for this i atom from the interaction with this j atom. */
1151 velec = _mm256_andnot_ps(dummy_mask,velec);
1152 velecsum = _mm256_add_ps(velecsum,velec);
1156 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1158 /* Calculate temporary vectorial force */
1159 tx = _mm256_mul_ps(fscal,dx12);
1160 ty = _mm256_mul_ps(fscal,dy12);
1161 tz = _mm256_mul_ps(fscal,dz12);
1163 /* Update vectorial force */
1164 fix1 = _mm256_add_ps(fix1,tx);
1165 fiy1 = _mm256_add_ps(fiy1,ty);
1166 fiz1 = _mm256_add_ps(fiz1,tz);
1168 fjx2 = _mm256_add_ps(fjx2,tx);
1169 fjy2 = _mm256_add_ps(fjy2,ty);
1170 fjz2 = _mm256_add_ps(fjz2,tz);
1172 /**************************
1173 * CALCULATE INTERACTIONS *
1174 **************************/
1176 r13 = _mm256_mul_ps(rsq13,rinv13);
1177 r13 = _mm256_andnot_ps(dummy_mask,r13);
1179 /* Calculate table index by multiplying r with table scale and truncate to integer */
1180 rt = _mm256_mul_ps(r13,vftabscale);
1181 vfitab = _mm256_cvttps_epi32(rt);
1182 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1183 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1184 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1185 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1186 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1187 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1189 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1190 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1191 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1192 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1193 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1194 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1195 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1196 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1197 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1198 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1199 Heps = _mm256_mul_ps(vfeps,H);
1200 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1201 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1202 velec = _mm256_mul_ps(qq13,VV);
1203 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1204 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
1206 /* Update potential sum for this i atom from the interaction with this j atom. */
1207 velec = _mm256_andnot_ps(dummy_mask,velec);
1208 velecsum = _mm256_add_ps(velecsum,velec);
1212 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1214 /* Calculate temporary vectorial force */
1215 tx = _mm256_mul_ps(fscal,dx13);
1216 ty = _mm256_mul_ps(fscal,dy13);
1217 tz = _mm256_mul_ps(fscal,dz13);
1219 /* Update vectorial force */
1220 fix1 = _mm256_add_ps(fix1,tx);
1221 fiy1 = _mm256_add_ps(fiy1,ty);
1222 fiz1 = _mm256_add_ps(fiz1,tz);
1224 fjx3 = _mm256_add_ps(fjx3,tx);
1225 fjy3 = _mm256_add_ps(fjy3,ty);
1226 fjz3 = _mm256_add_ps(fjz3,tz);
1228 /**************************
1229 * CALCULATE INTERACTIONS *
1230 **************************/
1232 r21 = _mm256_mul_ps(rsq21,rinv21);
1233 r21 = _mm256_andnot_ps(dummy_mask,r21);
1235 /* Calculate table index by multiplying r with table scale and truncate to integer */
1236 rt = _mm256_mul_ps(r21,vftabscale);
1237 vfitab = _mm256_cvttps_epi32(rt);
1238 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1239 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1240 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1241 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1242 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1243 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1245 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1246 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1247 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1248 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1249 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1250 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1251 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1252 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1253 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1254 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1255 Heps = _mm256_mul_ps(vfeps,H);
1256 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1257 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1258 velec = _mm256_mul_ps(qq21,VV);
1259 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1260 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1262 /* Update potential sum for this i atom from the interaction with this j atom. */
1263 velec = _mm256_andnot_ps(dummy_mask,velec);
1264 velecsum = _mm256_add_ps(velecsum,velec);
1268 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1270 /* Calculate temporary vectorial force */
1271 tx = _mm256_mul_ps(fscal,dx21);
1272 ty = _mm256_mul_ps(fscal,dy21);
1273 tz = _mm256_mul_ps(fscal,dz21);
1275 /* Update vectorial force */
1276 fix2 = _mm256_add_ps(fix2,tx);
1277 fiy2 = _mm256_add_ps(fiy2,ty);
1278 fiz2 = _mm256_add_ps(fiz2,tz);
1280 fjx1 = _mm256_add_ps(fjx1,tx);
1281 fjy1 = _mm256_add_ps(fjy1,ty);
1282 fjz1 = _mm256_add_ps(fjz1,tz);
1284 /**************************
1285 * CALCULATE INTERACTIONS *
1286 **************************/
1288 r22 = _mm256_mul_ps(rsq22,rinv22);
1289 r22 = _mm256_andnot_ps(dummy_mask,r22);
1291 /* Calculate table index by multiplying r with table scale and truncate to integer */
1292 rt = _mm256_mul_ps(r22,vftabscale);
1293 vfitab = _mm256_cvttps_epi32(rt);
1294 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1295 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1296 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1297 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1298 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1299 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1301 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1302 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1303 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1304 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1305 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1306 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1307 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1308 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1309 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1310 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1311 Heps = _mm256_mul_ps(vfeps,H);
1312 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1313 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1314 velec = _mm256_mul_ps(qq22,VV);
1315 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1316 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1318 /* Update potential sum for this i atom from the interaction with this j atom. */
1319 velec = _mm256_andnot_ps(dummy_mask,velec);
1320 velecsum = _mm256_add_ps(velecsum,velec);
1324 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1326 /* Calculate temporary vectorial force */
1327 tx = _mm256_mul_ps(fscal,dx22);
1328 ty = _mm256_mul_ps(fscal,dy22);
1329 tz = _mm256_mul_ps(fscal,dz22);
1331 /* Update vectorial force */
1332 fix2 = _mm256_add_ps(fix2,tx);
1333 fiy2 = _mm256_add_ps(fiy2,ty);
1334 fiz2 = _mm256_add_ps(fiz2,tz);
1336 fjx2 = _mm256_add_ps(fjx2,tx);
1337 fjy2 = _mm256_add_ps(fjy2,ty);
1338 fjz2 = _mm256_add_ps(fjz2,tz);
1340 /**************************
1341 * CALCULATE INTERACTIONS *
1342 **************************/
1344 r23 = _mm256_mul_ps(rsq23,rinv23);
1345 r23 = _mm256_andnot_ps(dummy_mask,r23);
1347 /* Calculate table index by multiplying r with table scale and truncate to integer */
1348 rt = _mm256_mul_ps(r23,vftabscale);
1349 vfitab = _mm256_cvttps_epi32(rt);
1350 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1351 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1352 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1353 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1354 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1355 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1357 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1358 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1359 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1360 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1361 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1362 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1363 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1364 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1365 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1366 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1367 Heps = _mm256_mul_ps(vfeps,H);
1368 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1369 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1370 velec = _mm256_mul_ps(qq23,VV);
1371 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1372 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
1374 /* Update potential sum for this i atom from the interaction with this j atom. */
1375 velec = _mm256_andnot_ps(dummy_mask,velec);
1376 velecsum = _mm256_add_ps(velecsum,velec);
1380 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1382 /* Calculate temporary vectorial force */
1383 tx = _mm256_mul_ps(fscal,dx23);
1384 ty = _mm256_mul_ps(fscal,dy23);
1385 tz = _mm256_mul_ps(fscal,dz23);
1387 /* Update vectorial force */
1388 fix2 = _mm256_add_ps(fix2,tx);
1389 fiy2 = _mm256_add_ps(fiy2,ty);
1390 fiz2 = _mm256_add_ps(fiz2,tz);
1392 fjx3 = _mm256_add_ps(fjx3,tx);
1393 fjy3 = _mm256_add_ps(fjy3,ty);
1394 fjz3 = _mm256_add_ps(fjz3,tz);
1396 /**************************
1397 * CALCULATE INTERACTIONS *
1398 **************************/
1400 r31 = _mm256_mul_ps(rsq31,rinv31);
1401 r31 = _mm256_andnot_ps(dummy_mask,r31);
1403 /* Calculate table index by multiplying r with table scale and truncate to integer */
1404 rt = _mm256_mul_ps(r31,vftabscale);
1405 vfitab = _mm256_cvttps_epi32(rt);
1406 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1407 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1408 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1409 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1410 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1411 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1413 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1414 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1415 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1416 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1417 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1418 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1419 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1420 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1421 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1422 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1423 Heps = _mm256_mul_ps(vfeps,H);
1424 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1425 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1426 velec = _mm256_mul_ps(qq31,VV);
1427 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1428 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
1430 /* Update potential sum for this i atom from the interaction with this j atom. */
1431 velec = _mm256_andnot_ps(dummy_mask,velec);
1432 velecsum = _mm256_add_ps(velecsum,velec);
1436 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1438 /* Calculate temporary vectorial force */
1439 tx = _mm256_mul_ps(fscal,dx31);
1440 ty = _mm256_mul_ps(fscal,dy31);
1441 tz = _mm256_mul_ps(fscal,dz31);
1443 /* Update vectorial force */
1444 fix3 = _mm256_add_ps(fix3,tx);
1445 fiy3 = _mm256_add_ps(fiy3,ty);
1446 fiz3 = _mm256_add_ps(fiz3,tz);
1448 fjx1 = _mm256_add_ps(fjx1,tx);
1449 fjy1 = _mm256_add_ps(fjy1,ty);
1450 fjz1 = _mm256_add_ps(fjz1,tz);
1452 /**************************
1453 * CALCULATE INTERACTIONS *
1454 **************************/
1456 r32 = _mm256_mul_ps(rsq32,rinv32);
1457 r32 = _mm256_andnot_ps(dummy_mask,r32);
1459 /* Calculate table index by multiplying r with table scale and truncate to integer */
1460 rt = _mm256_mul_ps(r32,vftabscale);
1461 vfitab = _mm256_cvttps_epi32(rt);
1462 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1463 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1464 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1465 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1466 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1467 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1469 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1470 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1471 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1472 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1473 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1474 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1475 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1476 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1477 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1478 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1479 Heps = _mm256_mul_ps(vfeps,H);
1480 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1481 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1482 velec = _mm256_mul_ps(qq32,VV);
1483 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1484 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
1486 /* Update potential sum for this i atom from the interaction with this j atom. */
1487 velec = _mm256_andnot_ps(dummy_mask,velec);
1488 velecsum = _mm256_add_ps(velecsum,velec);
1492 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1494 /* Calculate temporary vectorial force */
1495 tx = _mm256_mul_ps(fscal,dx32);
1496 ty = _mm256_mul_ps(fscal,dy32);
1497 tz = _mm256_mul_ps(fscal,dz32);
1499 /* Update vectorial force */
1500 fix3 = _mm256_add_ps(fix3,tx);
1501 fiy3 = _mm256_add_ps(fiy3,ty);
1502 fiz3 = _mm256_add_ps(fiz3,tz);
1504 fjx2 = _mm256_add_ps(fjx2,tx);
1505 fjy2 = _mm256_add_ps(fjy2,ty);
1506 fjz2 = _mm256_add_ps(fjz2,tz);
1508 /**************************
1509 * CALCULATE INTERACTIONS *
1510 **************************/
1512 r33 = _mm256_mul_ps(rsq33,rinv33);
1513 r33 = _mm256_andnot_ps(dummy_mask,r33);
1515 /* Calculate table index by multiplying r with table scale and truncate to integer */
1516 rt = _mm256_mul_ps(r33,vftabscale);
1517 vfitab = _mm256_cvttps_epi32(rt);
1518 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1519 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1520 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1521 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1522 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1523 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1525 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1526 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1527 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1528 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1529 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1530 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1531 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1532 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1533 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1534 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1535 Heps = _mm256_mul_ps(vfeps,H);
1536 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1537 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1538 velec = _mm256_mul_ps(qq33,VV);
1539 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1540 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
1542 /* Update potential sum for this i atom from the interaction with this j atom. */
1543 velec = _mm256_andnot_ps(dummy_mask,velec);
1544 velecsum = _mm256_add_ps(velecsum,velec);
1548 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1550 /* Calculate temporary vectorial force */
1551 tx = _mm256_mul_ps(fscal,dx33);
1552 ty = _mm256_mul_ps(fscal,dy33);
1553 tz = _mm256_mul_ps(fscal,dz33);
1555 /* Update vectorial force */
1556 fix3 = _mm256_add_ps(fix3,tx);
1557 fiy3 = _mm256_add_ps(fiy3,ty);
1558 fiz3 = _mm256_add_ps(fiz3,tz);
1560 fjx3 = _mm256_add_ps(fjx3,tx);
1561 fjy3 = _mm256_add_ps(fjy3,ty);
1562 fjz3 = _mm256_add_ps(fjz3,tz);
1564 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1565 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1566 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1567 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1568 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1569 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1570 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1571 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1573 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1574 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1575 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1577 /* Inner loop uses 456 flops */
1580 /* End of innermost loop */
1582 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1583 f+i_coord_offset,fshift+i_shift_offset);
1586 /* Update potential energies */
1587 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1588 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1590 /* Increment number of inner iterations */
1591 inneriter += j_index_end - j_index_start;
1593 /* Outer loop uses 26 flops */
1596 /* Increment number of outer iterations */
1599 /* Update outer/inner flops */
1601 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*456);
1604 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_single
1605 * Electrostatics interaction: CubicSplineTable
1606 * VdW interaction: CubicSplineTable
1607 * Geometry: Water4-Water4
1608 * Calculate force/pot: Force
1611 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_single
1612 (t_nblist * gmx_restrict nlist,
1613 rvec * gmx_restrict xx,
1614 rvec * gmx_restrict ff,
1615 t_forcerec * gmx_restrict fr,
1616 t_mdatoms * gmx_restrict mdatoms,
1617 nb_kernel_data_t * gmx_restrict kernel_data,
1618 t_nrnb * gmx_restrict nrnb)
1620 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1621 * just 0 for non-waters.
1622 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1623 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1625 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1626 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1627 int jnrA,jnrB,jnrC,jnrD;
1628 int jnrE,jnrF,jnrG,jnrH;
1629 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1630 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1631 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1632 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1633 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1634 real rcutoff_scalar;
1635 real *shiftvec,*fshift,*x,*f;
1636 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1637 real scratch[4*DIM];
1638 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1639 real * vdwioffsetptr0;
1640 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1641 real * vdwioffsetptr1;
1642 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1643 real * vdwioffsetptr2;
1644 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1645 real * vdwioffsetptr3;
1646 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1647 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1648 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1649 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1650 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1651 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1652 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1653 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1654 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1655 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1656 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1657 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1658 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1659 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1660 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1661 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1662 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1663 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1664 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1665 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1668 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1671 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1672 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1674 __m128i vfitab_lo,vfitab_hi;
1675 __m128i ifour = _mm_set1_epi32(4);
1676 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1678 __m256 dummy_mask,cutoff_mask;
1679 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1680 __m256 one = _mm256_set1_ps(1.0);
1681 __m256 two = _mm256_set1_ps(2.0);
1687 jindex = nlist->jindex;
1689 shiftidx = nlist->shift;
1691 shiftvec = fr->shift_vec[0];
1692 fshift = fr->fshift[0];
1693 facel = _mm256_set1_ps(fr->epsfac);
1694 charge = mdatoms->chargeA;
1695 nvdwtype = fr->ntype;
1696 vdwparam = fr->nbfp;
1697 vdwtype = mdatoms->typeA;
1699 vftab = kernel_data->table_elec_vdw->data;
1700 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
1702 /* Setup water-specific parameters */
1703 inr = nlist->iinr[0];
1704 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1705 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1706 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1707 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1709 jq1 = _mm256_set1_ps(charge[inr+1]);
1710 jq2 = _mm256_set1_ps(charge[inr+2]);
1711 jq3 = _mm256_set1_ps(charge[inr+3]);
1712 vdwjidx0A = 2*vdwtype[inr+0];
1713 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1714 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1715 qq11 = _mm256_mul_ps(iq1,jq1);
1716 qq12 = _mm256_mul_ps(iq1,jq2);
1717 qq13 = _mm256_mul_ps(iq1,jq3);
1718 qq21 = _mm256_mul_ps(iq2,jq1);
1719 qq22 = _mm256_mul_ps(iq2,jq2);
1720 qq23 = _mm256_mul_ps(iq2,jq3);
1721 qq31 = _mm256_mul_ps(iq3,jq1);
1722 qq32 = _mm256_mul_ps(iq3,jq2);
1723 qq33 = _mm256_mul_ps(iq3,jq3);
1725 /* Avoid stupid compiler warnings */
1726 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1727 j_coord_offsetA = 0;
1728 j_coord_offsetB = 0;
1729 j_coord_offsetC = 0;
1730 j_coord_offsetD = 0;
1731 j_coord_offsetE = 0;
1732 j_coord_offsetF = 0;
1733 j_coord_offsetG = 0;
1734 j_coord_offsetH = 0;
1739 for(iidx=0;iidx<4*DIM;iidx++)
1741 scratch[iidx] = 0.0;
1744 /* Start outer loop over neighborlists */
1745 for(iidx=0; iidx<nri; iidx++)
1747 /* Load shift vector for this list */
1748 i_shift_offset = DIM*shiftidx[iidx];
1750 /* Load limits for loop over neighbors */
1751 j_index_start = jindex[iidx];
1752 j_index_end = jindex[iidx+1];
1754 /* Get outer coordinate index */
1756 i_coord_offset = DIM*inr;
1758 /* Load i particle coords and add shift vector */
1759 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1760 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1762 fix0 = _mm256_setzero_ps();
1763 fiy0 = _mm256_setzero_ps();
1764 fiz0 = _mm256_setzero_ps();
1765 fix1 = _mm256_setzero_ps();
1766 fiy1 = _mm256_setzero_ps();
1767 fiz1 = _mm256_setzero_ps();
1768 fix2 = _mm256_setzero_ps();
1769 fiy2 = _mm256_setzero_ps();
1770 fiz2 = _mm256_setzero_ps();
1771 fix3 = _mm256_setzero_ps();
1772 fiy3 = _mm256_setzero_ps();
1773 fiz3 = _mm256_setzero_ps();
1775 /* Start inner kernel loop */
1776 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1779 /* Get j neighbor index, and coordinate index */
1781 jnrB = jjnr[jidx+1];
1782 jnrC = jjnr[jidx+2];
1783 jnrD = jjnr[jidx+3];
1784 jnrE = jjnr[jidx+4];
1785 jnrF = jjnr[jidx+5];
1786 jnrG = jjnr[jidx+6];
1787 jnrH = jjnr[jidx+7];
1788 j_coord_offsetA = DIM*jnrA;
1789 j_coord_offsetB = DIM*jnrB;
1790 j_coord_offsetC = DIM*jnrC;
1791 j_coord_offsetD = DIM*jnrD;
1792 j_coord_offsetE = DIM*jnrE;
1793 j_coord_offsetF = DIM*jnrF;
1794 j_coord_offsetG = DIM*jnrG;
1795 j_coord_offsetH = DIM*jnrH;
1797 /* load j atom coordinates */
1798 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1799 x+j_coord_offsetC,x+j_coord_offsetD,
1800 x+j_coord_offsetE,x+j_coord_offsetF,
1801 x+j_coord_offsetG,x+j_coord_offsetH,
1802 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1803 &jy2,&jz2,&jx3,&jy3,&jz3);
1805 /* Calculate displacement vector */
1806 dx00 = _mm256_sub_ps(ix0,jx0);
1807 dy00 = _mm256_sub_ps(iy0,jy0);
1808 dz00 = _mm256_sub_ps(iz0,jz0);
1809 dx11 = _mm256_sub_ps(ix1,jx1);
1810 dy11 = _mm256_sub_ps(iy1,jy1);
1811 dz11 = _mm256_sub_ps(iz1,jz1);
1812 dx12 = _mm256_sub_ps(ix1,jx2);
1813 dy12 = _mm256_sub_ps(iy1,jy2);
1814 dz12 = _mm256_sub_ps(iz1,jz2);
1815 dx13 = _mm256_sub_ps(ix1,jx3);
1816 dy13 = _mm256_sub_ps(iy1,jy3);
1817 dz13 = _mm256_sub_ps(iz1,jz3);
1818 dx21 = _mm256_sub_ps(ix2,jx1);
1819 dy21 = _mm256_sub_ps(iy2,jy1);
1820 dz21 = _mm256_sub_ps(iz2,jz1);
1821 dx22 = _mm256_sub_ps(ix2,jx2);
1822 dy22 = _mm256_sub_ps(iy2,jy2);
1823 dz22 = _mm256_sub_ps(iz2,jz2);
1824 dx23 = _mm256_sub_ps(ix2,jx3);
1825 dy23 = _mm256_sub_ps(iy2,jy3);
1826 dz23 = _mm256_sub_ps(iz2,jz3);
1827 dx31 = _mm256_sub_ps(ix3,jx1);
1828 dy31 = _mm256_sub_ps(iy3,jy1);
1829 dz31 = _mm256_sub_ps(iz3,jz1);
1830 dx32 = _mm256_sub_ps(ix3,jx2);
1831 dy32 = _mm256_sub_ps(iy3,jy2);
1832 dz32 = _mm256_sub_ps(iz3,jz2);
1833 dx33 = _mm256_sub_ps(ix3,jx3);
1834 dy33 = _mm256_sub_ps(iy3,jy3);
1835 dz33 = _mm256_sub_ps(iz3,jz3);
1837 /* Calculate squared distance and things based on it */
1838 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1839 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1840 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1841 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1842 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1843 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1844 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1845 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1846 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1847 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1849 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1850 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1851 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1852 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1853 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1854 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1855 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1856 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1857 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1858 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1860 fjx0 = _mm256_setzero_ps();
1861 fjy0 = _mm256_setzero_ps();
1862 fjz0 = _mm256_setzero_ps();
1863 fjx1 = _mm256_setzero_ps();
1864 fjy1 = _mm256_setzero_ps();
1865 fjz1 = _mm256_setzero_ps();
1866 fjx2 = _mm256_setzero_ps();
1867 fjy2 = _mm256_setzero_ps();
1868 fjz2 = _mm256_setzero_ps();
1869 fjx3 = _mm256_setzero_ps();
1870 fjy3 = _mm256_setzero_ps();
1871 fjz3 = _mm256_setzero_ps();
1873 /**************************
1874 * CALCULATE INTERACTIONS *
1875 **************************/
1877 r00 = _mm256_mul_ps(rsq00,rinv00);
1879 /* Calculate table index by multiplying r with table scale and truncate to integer */
1880 rt = _mm256_mul_ps(r00,vftabscale);
1881 vfitab = _mm256_cvttps_epi32(rt);
1882 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1883 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1884 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1885 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1886 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1887 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1889 /* CUBIC SPLINE TABLE DISPERSION */
1890 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1891 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1892 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1893 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1894 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1895 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1896 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1897 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1898 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1899 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1900 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1901 Heps = _mm256_mul_ps(vfeps,H);
1902 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1903 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1904 fvdw6 = _mm256_mul_ps(c6_00,FF);
1906 /* CUBIC SPLINE TABLE REPULSION */
1907 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1908 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1909 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1910 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1911 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1912 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1913 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1914 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1915 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1916 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1917 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1918 Heps = _mm256_mul_ps(vfeps,H);
1919 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1920 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1921 fvdw12 = _mm256_mul_ps(c12_00,FF);
1922 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1926 /* Calculate temporary vectorial force */
1927 tx = _mm256_mul_ps(fscal,dx00);
1928 ty = _mm256_mul_ps(fscal,dy00);
1929 tz = _mm256_mul_ps(fscal,dz00);
1931 /* Update vectorial force */
1932 fix0 = _mm256_add_ps(fix0,tx);
1933 fiy0 = _mm256_add_ps(fiy0,ty);
1934 fiz0 = _mm256_add_ps(fiz0,tz);
1936 fjx0 = _mm256_add_ps(fjx0,tx);
1937 fjy0 = _mm256_add_ps(fjy0,ty);
1938 fjz0 = _mm256_add_ps(fjz0,tz);
1940 /**************************
1941 * CALCULATE INTERACTIONS *
1942 **************************/
1944 r11 = _mm256_mul_ps(rsq11,rinv11);
1946 /* Calculate table index by multiplying r with table scale and truncate to integer */
1947 rt = _mm256_mul_ps(r11,vftabscale);
1948 vfitab = _mm256_cvttps_epi32(rt);
1949 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1950 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1951 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1952 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1953 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1954 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1956 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1957 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1958 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1959 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1960 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1961 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1962 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1963 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1964 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1965 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1966 Heps = _mm256_mul_ps(vfeps,H);
1967 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1968 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1969 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1973 /* Calculate temporary vectorial force */
1974 tx = _mm256_mul_ps(fscal,dx11);
1975 ty = _mm256_mul_ps(fscal,dy11);
1976 tz = _mm256_mul_ps(fscal,dz11);
1978 /* Update vectorial force */
1979 fix1 = _mm256_add_ps(fix1,tx);
1980 fiy1 = _mm256_add_ps(fiy1,ty);
1981 fiz1 = _mm256_add_ps(fiz1,tz);
1983 fjx1 = _mm256_add_ps(fjx1,tx);
1984 fjy1 = _mm256_add_ps(fjy1,ty);
1985 fjz1 = _mm256_add_ps(fjz1,tz);
1987 /**************************
1988 * CALCULATE INTERACTIONS *
1989 **************************/
1991 r12 = _mm256_mul_ps(rsq12,rinv12);
1993 /* Calculate table index by multiplying r with table scale and truncate to integer */
1994 rt = _mm256_mul_ps(r12,vftabscale);
1995 vfitab = _mm256_cvttps_epi32(rt);
1996 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1997 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1998 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1999 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2000 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2001 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2003 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2004 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2005 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2006 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2007 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2008 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2009 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2010 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2011 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2012 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2013 Heps = _mm256_mul_ps(vfeps,H);
2014 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2015 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2016 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2020 /* Calculate temporary vectorial force */
2021 tx = _mm256_mul_ps(fscal,dx12);
2022 ty = _mm256_mul_ps(fscal,dy12);
2023 tz = _mm256_mul_ps(fscal,dz12);
2025 /* Update vectorial force */
2026 fix1 = _mm256_add_ps(fix1,tx);
2027 fiy1 = _mm256_add_ps(fiy1,ty);
2028 fiz1 = _mm256_add_ps(fiz1,tz);
2030 fjx2 = _mm256_add_ps(fjx2,tx);
2031 fjy2 = _mm256_add_ps(fjy2,ty);
2032 fjz2 = _mm256_add_ps(fjz2,tz);
2034 /**************************
2035 * CALCULATE INTERACTIONS *
2036 **************************/
2038 r13 = _mm256_mul_ps(rsq13,rinv13);
2040 /* Calculate table index by multiplying r with table scale and truncate to integer */
2041 rt = _mm256_mul_ps(r13,vftabscale);
2042 vfitab = _mm256_cvttps_epi32(rt);
2043 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2044 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2045 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2046 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2047 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2048 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2050 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2051 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2052 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2053 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2054 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2055 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2056 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2057 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2058 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2059 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2060 Heps = _mm256_mul_ps(vfeps,H);
2061 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2062 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2063 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
2067 /* Calculate temporary vectorial force */
2068 tx = _mm256_mul_ps(fscal,dx13);
2069 ty = _mm256_mul_ps(fscal,dy13);
2070 tz = _mm256_mul_ps(fscal,dz13);
2072 /* Update vectorial force */
2073 fix1 = _mm256_add_ps(fix1,tx);
2074 fiy1 = _mm256_add_ps(fiy1,ty);
2075 fiz1 = _mm256_add_ps(fiz1,tz);
2077 fjx3 = _mm256_add_ps(fjx3,tx);
2078 fjy3 = _mm256_add_ps(fjy3,ty);
2079 fjz3 = _mm256_add_ps(fjz3,tz);
2081 /**************************
2082 * CALCULATE INTERACTIONS *
2083 **************************/
2085 r21 = _mm256_mul_ps(rsq21,rinv21);
2087 /* Calculate table index by multiplying r with table scale and truncate to integer */
2088 rt = _mm256_mul_ps(r21,vftabscale);
2089 vfitab = _mm256_cvttps_epi32(rt);
2090 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2091 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2092 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2093 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2094 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2095 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2097 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2098 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2099 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2100 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2101 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2102 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2103 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2104 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2105 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2106 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2107 Heps = _mm256_mul_ps(vfeps,H);
2108 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2109 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2110 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2114 /* Calculate temporary vectorial force */
2115 tx = _mm256_mul_ps(fscal,dx21);
2116 ty = _mm256_mul_ps(fscal,dy21);
2117 tz = _mm256_mul_ps(fscal,dz21);
2119 /* Update vectorial force */
2120 fix2 = _mm256_add_ps(fix2,tx);
2121 fiy2 = _mm256_add_ps(fiy2,ty);
2122 fiz2 = _mm256_add_ps(fiz2,tz);
2124 fjx1 = _mm256_add_ps(fjx1,tx);
2125 fjy1 = _mm256_add_ps(fjy1,ty);
2126 fjz1 = _mm256_add_ps(fjz1,tz);
2128 /**************************
2129 * CALCULATE INTERACTIONS *
2130 **************************/
2132 r22 = _mm256_mul_ps(rsq22,rinv22);
2134 /* Calculate table index by multiplying r with table scale and truncate to integer */
2135 rt = _mm256_mul_ps(r22,vftabscale);
2136 vfitab = _mm256_cvttps_epi32(rt);
2137 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2138 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2139 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2140 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2141 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2142 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2144 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2145 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2146 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2147 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2148 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2149 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2150 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2151 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2152 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2153 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2154 Heps = _mm256_mul_ps(vfeps,H);
2155 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2156 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2157 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2161 /* Calculate temporary vectorial force */
2162 tx = _mm256_mul_ps(fscal,dx22);
2163 ty = _mm256_mul_ps(fscal,dy22);
2164 tz = _mm256_mul_ps(fscal,dz22);
2166 /* Update vectorial force */
2167 fix2 = _mm256_add_ps(fix2,tx);
2168 fiy2 = _mm256_add_ps(fiy2,ty);
2169 fiz2 = _mm256_add_ps(fiz2,tz);
2171 fjx2 = _mm256_add_ps(fjx2,tx);
2172 fjy2 = _mm256_add_ps(fjy2,ty);
2173 fjz2 = _mm256_add_ps(fjz2,tz);
2175 /**************************
2176 * CALCULATE INTERACTIONS *
2177 **************************/
2179 r23 = _mm256_mul_ps(rsq23,rinv23);
2181 /* Calculate table index by multiplying r with table scale and truncate to integer */
2182 rt = _mm256_mul_ps(r23,vftabscale);
2183 vfitab = _mm256_cvttps_epi32(rt);
2184 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2185 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2186 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2187 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2188 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2189 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2191 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2192 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2193 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2194 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2195 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2196 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2197 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2198 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2199 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2200 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2201 Heps = _mm256_mul_ps(vfeps,H);
2202 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2203 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2204 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2208 /* Calculate temporary vectorial force */
2209 tx = _mm256_mul_ps(fscal,dx23);
2210 ty = _mm256_mul_ps(fscal,dy23);
2211 tz = _mm256_mul_ps(fscal,dz23);
2213 /* Update vectorial force */
2214 fix2 = _mm256_add_ps(fix2,tx);
2215 fiy2 = _mm256_add_ps(fiy2,ty);
2216 fiz2 = _mm256_add_ps(fiz2,tz);
2218 fjx3 = _mm256_add_ps(fjx3,tx);
2219 fjy3 = _mm256_add_ps(fjy3,ty);
2220 fjz3 = _mm256_add_ps(fjz3,tz);
2222 /**************************
2223 * CALCULATE INTERACTIONS *
2224 **************************/
2226 r31 = _mm256_mul_ps(rsq31,rinv31);
2228 /* Calculate table index by multiplying r with table scale and truncate to integer */
2229 rt = _mm256_mul_ps(r31,vftabscale);
2230 vfitab = _mm256_cvttps_epi32(rt);
2231 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2232 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2233 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2234 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2235 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2236 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2238 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2239 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2240 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2241 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2242 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2243 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2244 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2245 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2246 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2247 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2248 Heps = _mm256_mul_ps(vfeps,H);
2249 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2250 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2251 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2255 /* Calculate temporary vectorial force */
2256 tx = _mm256_mul_ps(fscal,dx31);
2257 ty = _mm256_mul_ps(fscal,dy31);
2258 tz = _mm256_mul_ps(fscal,dz31);
2260 /* Update vectorial force */
2261 fix3 = _mm256_add_ps(fix3,tx);
2262 fiy3 = _mm256_add_ps(fiy3,ty);
2263 fiz3 = _mm256_add_ps(fiz3,tz);
2265 fjx1 = _mm256_add_ps(fjx1,tx);
2266 fjy1 = _mm256_add_ps(fjy1,ty);
2267 fjz1 = _mm256_add_ps(fjz1,tz);
2269 /**************************
2270 * CALCULATE INTERACTIONS *
2271 **************************/
2273 r32 = _mm256_mul_ps(rsq32,rinv32);
2275 /* Calculate table index by multiplying r with table scale and truncate to integer */
2276 rt = _mm256_mul_ps(r32,vftabscale);
2277 vfitab = _mm256_cvttps_epi32(rt);
2278 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2279 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2280 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2281 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2282 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2283 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2285 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2286 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2287 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2288 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2289 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2290 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2291 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2292 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2293 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2294 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2295 Heps = _mm256_mul_ps(vfeps,H);
2296 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2297 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2298 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2302 /* Calculate temporary vectorial force */
2303 tx = _mm256_mul_ps(fscal,dx32);
2304 ty = _mm256_mul_ps(fscal,dy32);
2305 tz = _mm256_mul_ps(fscal,dz32);
2307 /* Update vectorial force */
2308 fix3 = _mm256_add_ps(fix3,tx);
2309 fiy3 = _mm256_add_ps(fiy3,ty);
2310 fiz3 = _mm256_add_ps(fiz3,tz);
2312 fjx2 = _mm256_add_ps(fjx2,tx);
2313 fjy2 = _mm256_add_ps(fjy2,ty);
2314 fjz2 = _mm256_add_ps(fjz2,tz);
2316 /**************************
2317 * CALCULATE INTERACTIONS *
2318 **************************/
2320 r33 = _mm256_mul_ps(rsq33,rinv33);
2322 /* Calculate table index by multiplying r with table scale and truncate to integer */
2323 rt = _mm256_mul_ps(r33,vftabscale);
2324 vfitab = _mm256_cvttps_epi32(rt);
2325 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2326 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2327 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2328 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2329 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2330 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2332 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2333 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2334 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2335 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2336 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2337 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2338 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2339 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2340 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2341 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2342 Heps = _mm256_mul_ps(vfeps,H);
2343 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2344 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2345 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2349 /* Calculate temporary vectorial force */
2350 tx = _mm256_mul_ps(fscal,dx33);
2351 ty = _mm256_mul_ps(fscal,dy33);
2352 tz = _mm256_mul_ps(fscal,dz33);
2354 /* Update vectorial force */
2355 fix3 = _mm256_add_ps(fix3,tx);
2356 fiy3 = _mm256_add_ps(fiy3,ty);
2357 fiz3 = _mm256_add_ps(fiz3,tz);
2359 fjx3 = _mm256_add_ps(fjx3,tx);
2360 fjy3 = _mm256_add_ps(fjy3,ty);
2361 fjz3 = _mm256_add_ps(fjz3,tz);
2363 fjptrA = f+j_coord_offsetA;
2364 fjptrB = f+j_coord_offsetB;
2365 fjptrC = f+j_coord_offsetC;
2366 fjptrD = f+j_coord_offsetD;
2367 fjptrE = f+j_coord_offsetE;
2368 fjptrF = f+j_coord_offsetF;
2369 fjptrG = f+j_coord_offsetG;
2370 fjptrH = f+j_coord_offsetH;
2372 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2373 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2374 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2376 /* Inner loop uses 402 flops */
2379 if(jidx<j_index_end)
2382 /* Get j neighbor index, and coordinate index */
2383 jnrlistA = jjnr[jidx];
2384 jnrlistB = jjnr[jidx+1];
2385 jnrlistC = jjnr[jidx+2];
2386 jnrlistD = jjnr[jidx+3];
2387 jnrlistE = jjnr[jidx+4];
2388 jnrlistF = jjnr[jidx+5];
2389 jnrlistG = jjnr[jidx+6];
2390 jnrlistH = jjnr[jidx+7];
2391 /* Sign of each element will be negative for non-real atoms.
2392 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2393 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2395 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2396 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2398 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2399 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2400 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2401 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2402 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2403 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2404 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2405 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2406 j_coord_offsetA = DIM*jnrA;
2407 j_coord_offsetB = DIM*jnrB;
2408 j_coord_offsetC = DIM*jnrC;
2409 j_coord_offsetD = DIM*jnrD;
2410 j_coord_offsetE = DIM*jnrE;
2411 j_coord_offsetF = DIM*jnrF;
2412 j_coord_offsetG = DIM*jnrG;
2413 j_coord_offsetH = DIM*jnrH;
2415 /* load j atom coordinates */
2416 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2417 x+j_coord_offsetC,x+j_coord_offsetD,
2418 x+j_coord_offsetE,x+j_coord_offsetF,
2419 x+j_coord_offsetG,x+j_coord_offsetH,
2420 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2421 &jy2,&jz2,&jx3,&jy3,&jz3);
2423 /* Calculate displacement vector */
2424 dx00 = _mm256_sub_ps(ix0,jx0);
2425 dy00 = _mm256_sub_ps(iy0,jy0);
2426 dz00 = _mm256_sub_ps(iz0,jz0);
2427 dx11 = _mm256_sub_ps(ix1,jx1);
2428 dy11 = _mm256_sub_ps(iy1,jy1);
2429 dz11 = _mm256_sub_ps(iz1,jz1);
2430 dx12 = _mm256_sub_ps(ix1,jx2);
2431 dy12 = _mm256_sub_ps(iy1,jy2);
2432 dz12 = _mm256_sub_ps(iz1,jz2);
2433 dx13 = _mm256_sub_ps(ix1,jx3);
2434 dy13 = _mm256_sub_ps(iy1,jy3);
2435 dz13 = _mm256_sub_ps(iz1,jz3);
2436 dx21 = _mm256_sub_ps(ix2,jx1);
2437 dy21 = _mm256_sub_ps(iy2,jy1);
2438 dz21 = _mm256_sub_ps(iz2,jz1);
2439 dx22 = _mm256_sub_ps(ix2,jx2);
2440 dy22 = _mm256_sub_ps(iy2,jy2);
2441 dz22 = _mm256_sub_ps(iz2,jz2);
2442 dx23 = _mm256_sub_ps(ix2,jx3);
2443 dy23 = _mm256_sub_ps(iy2,jy3);
2444 dz23 = _mm256_sub_ps(iz2,jz3);
2445 dx31 = _mm256_sub_ps(ix3,jx1);
2446 dy31 = _mm256_sub_ps(iy3,jy1);
2447 dz31 = _mm256_sub_ps(iz3,jz1);
2448 dx32 = _mm256_sub_ps(ix3,jx2);
2449 dy32 = _mm256_sub_ps(iy3,jy2);
2450 dz32 = _mm256_sub_ps(iz3,jz2);
2451 dx33 = _mm256_sub_ps(ix3,jx3);
2452 dy33 = _mm256_sub_ps(iy3,jy3);
2453 dz33 = _mm256_sub_ps(iz3,jz3);
2455 /* Calculate squared distance and things based on it */
2456 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2457 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2458 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2459 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2460 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2461 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2462 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2463 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2464 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2465 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2467 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2468 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2469 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2470 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2471 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2472 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2473 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2474 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2475 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2476 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2478 fjx0 = _mm256_setzero_ps();
2479 fjy0 = _mm256_setzero_ps();
2480 fjz0 = _mm256_setzero_ps();
2481 fjx1 = _mm256_setzero_ps();
2482 fjy1 = _mm256_setzero_ps();
2483 fjz1 = _mm256_setzero_ps();
2484 fjx2 = _mm256_setzero_ps();
2485 fjy2 = _mm256_setzero_ps();
2486 fjz2 = _mm256_setzero_ps();
2487 fjx3 = _mm256_setzero_ps();
2488 fjy3 = _mm256_setzero_ps();
2489 fjz3 = _mm256_setzero_ps();
2491 /**************************
2492 * CALCULATE INTERACTIONS *
2493 **************************/
2495 r00 = _mm256_mul_ps(rsq00,rinv00);
2496 r00 = _mm256_andnot_ps(dummy_mask,r00);
2498 /* Calculate table index by multiplying r with table scale and truncate to integer */
2499 rt = _mm256_mul_ps(r00,vftabscale);
2500 vfitab = _mm256_cvttps_epi32(rt);
2501 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2502 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2503 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2504 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2505 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2506 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2508 /* CUBIC SPLINE TABLE DISPERSION */
2509 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2510 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2511 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2512 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2513 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2514 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2515 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2516 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2517 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2518 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2519 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2520 Heps = _mm256_mul_ps(vfeps,H);
2521 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2522 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2523 fvdw6 = _mm256_mul_ps(c6_00,FF);
2525 /* CUBIC SPLINE TABLE REPULSION */
2526 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2527 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2528 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2529 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2530 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2531 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2532 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2533 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2534 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2535 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2536 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2537 Heps = _mm256_mul_ps(vfeps,H);
2538 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2539 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2540 fvdw12 = _mm256_mul_ps(c12_00,FF);
2541 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2545 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2547 /* Calculate temporary vectorial force */
2548 tx = _mm256_mul_ps(fscal,dx00);
2549 ty = _mm256_mul_ps(fscal,dy00);
2550 tz = _mm256_mul_ps(fscal,dz00);
2552 /* Update vectorial force */
2553 fix0 = _mm256_add_ps(fix0,tx);
2554 fiy0 = _mm256_add_ps(fiy0,ty);
2555 fiz0 = _mm256_add_ps(fiz0,tz);
2557 fjx0 = _mm256_add_ps(fjx0,tx);
2558 fjy0 = _mm256_add_ps(fjy0,ty);
2559 fjz0 = _mm256_add_ps(fjz0,tz);
2561 /**************************
2562 * CALCULATE INTERACTIONS *
2563 **************************/
2565 r11 = _mm256_mul_ps(rsq11,rinv11);
2566 r11 = _mm256_andnot_ps(dummy_mask,r11);
2568 /* Calculate table index by multiplying r with table scale and truncate to integer */
2569 rt = _mm256_mul_ps(r11,vftabscale);
2570 vfitab = _mm256_cvttps_epi32(rt);
2571 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2572 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2573 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2574 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2575 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2576 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2578 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2579 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2580 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2581 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2582 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2583 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2584 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2585 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2586 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2587 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2588 Heps = _mm256_mul_ps(vfeps,H);
2589 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2590 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2591 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2595 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2597 /* Calculate temporary vectorial force */
2598 tx = _mm256_mul_ps(fscal,dx11);
2599 ty = _mm256_mul_ps(fscal,dy11);
2600 tz = _mm256_mul_ps(fscal,dz11);
2602 /* Update vectorial force */
2603 fix1 = _mm256_add_ps(fix1,tx);
2604 fiy1 = _mm256_add_ps(fiy1,ty);
2605 fiz1 = _mm256_add_ps(fiz1,tz);
2607 fjx1 = _mm256_add_ps(fjx1,tx);
2608 fjy1 = _mm256_add_ps(fjy1,ty);
2609 fjz1 = _mm256_add_ps(fjz1,tz);
2611 /**************************
2612 * CALCULATE INTERACTIONS *
2613 **************************/
2615 r12 = _mm256_mul_ps(rsq12,rinv12);
2616 r12 = _mm256_andnot_ps(dummy_mask,r12);
2618 /* Calculate table index by multiplying r with table scale and truncate to integer */
2619 rt = _mm256_mul_ps(r12,vftabscale);
2620 vfitab = _mm256_cvttps_epi32(rt);
2621 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2622 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2623 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2624 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2625 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2626 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2628 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2629 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2630 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2631 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2632 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2633 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2634 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2635 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2636 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2637 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2638 Heps = _mm256_mul_ps(vfeps,H);
2639 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2640 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2641 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2645 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2647 /* Calculate temporary vectorial force */
2648 tx = _mm256_mul_ps(fscal,dx12);
2649 ty = _mm256_mul_ps(fscal,dy12);
2650 tz = _mm256_mul_ps(fscal,dz12);
2652 /* Update vectorial force */
2653 fix1 = _mm256_add_ps(fix1,tx);
2654 fiy1 = _mm256_add_ps(fiy1,ty);
2655 fiz1 = _mm256_add_ps(fiz1,tz);
2657 fjx2 = _mm256_add_ps(fjx2,tx);
2658 fjy2 = _mm256_add_ps(fjy2,ty);
2659 fjz2 = _mm256_add_ps(fjz2,tz);
2661 /**************************
2662 * CALCULATE INTERACTIONS *
2663 **************************/
2665 r13 = _mm256_mul_ps(rsq13,rinv13);
2666 r13 = _mm256_andnot_ps(dummy_mask,r13);
2668 /* Calculate table index by multiplying r with table scale and truncate to integer */
2669 rt = _mm256_mul_ps(r13,vftabscale);
2670 vfitab = _mm256_cvttps_epi32(rt);
2671 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2672 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2673 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2674 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2675 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2676 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2678 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2679 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2680 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2681 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2682 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2683 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2684 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2685 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2686 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2687 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2688 Heps = _mm256_mul_ps(vfeps,H);
2689 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2690 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2691 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
2695 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2697 /* Calculate temporary vectorial force */
2698 tx = _mm256_mul_ps(fscal,dx13);
2699 ty = _mm256_mul_ps(fscal,dy13);
2700 tz = _mm256_mul_ps(fscal,dz13);
2702 /* Update vectorial force */
2703 fix1 = _mm256_add_ps(fix1,tx);
2704 fiy1 = _mm256_add_ps(fiy1,ty);
2705 fiz1 = _mm256_add_ps(fiz1,tz);
2707 fjx3 = _mm256_add_ps(fjx3,tx);
2708 fjy3 = _mm256_add_ps(fjy3,ty);
2709 fjz3 = _mm256_add_ps(fjz3,tz);
2711 /**************************
2712 * CALCULATE INTERACTIONS *
2713 **************************/
2715 r21 = _mm256_mul_ps(rsq21,rinv21);
2716 r21 = _mm256_andnot_ps(dummy_mask,r21);
2718 /* Calculate table index by multiplying r with table scale and truncate to integer */
2719 rt = _mm256_mul_ps(r21,vftabscale);
2720 vfitab = _mm256_cvttps_epi32(rt);
2721 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2722 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2723 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2724 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2725 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2726 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2728 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2729 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2730 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2731 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2732 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2733 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2734 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2735 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2736 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2737 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2738 Heps = _mm256_mul_ps(vfeps,H);
2739 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2740 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2741 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2745 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2747 /* Calculate temporary vectorial force */
2748 tx = _mm256_mul_ps(fscal,dx21);
2749 ty = _mm256_mul_ps(fscal,dy21);
2750 tz = _mm256_mul_ps(fscal,dz21);
2752 /* Update vectorial force */
2753 fix2 = _mm256_add_ps(fix2,tx);
2754 fiy2 = _mm256_add_ps(fiy2,ty);
2755 fiz2 = _mm256_add_ps(fiz2,tz);
2757 fjx1 = _mm256_add_ps(fjx1,tx);
2758 fjy1 = _mm256_add_ps(fjy1,ty);
2759 fjz1 = _mm256_add_ps(fjz1,tz);
2761 /**************************
2762 * CALCULATE INTERACTIONS *
2763 **************************/
2765 r22 = _mm256_mul_ps(rsq22,rinv22);
2766 r22 = _mm256_andnot_ps(dummy_mask,r22);
2768 /* Calculate table index by multiplying r with table scale and truncate to integer */
2769 rt = _mm256_mul_ps(r22,vftabscale);
2770 vfitab = _mm256_cvttps_epi32(rt);
2771 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2772 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2773 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2774 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2775 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2776 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2778 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2779 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2780 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2781 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2782 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2783 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2784 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2785 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2786 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2787 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2788 Heps = _mm256_mul_ps(vfeps,H);
2789 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2790 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2791 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2795 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2797 /* Calculate temporary vectorial force */
2798 tx = _mm256_mul_ps(fscal,dx22);
2799 ty = _mm256_mul_ps(fscal,dy22);
2800 tz = _mm256_mul_ps(fscal,dz22);
2802 /* Update vectorial force */
2803 fix2 = _mm256_add_ps(fix2,tx);
2804 fiy2 = _mm256_add_ps(fiy2,ty);
2805 fiz2 = _mm256_add_ps(fiz2,tz);
2807 fjx2 = _mm256_add_ps(fjx2,tx);
2808 fjy2 = _mm256_add_ps(fjy2,ty);
2809 fjz2 = _mm256_add_ps(fjz2,tz);
2811 /**************************
2812 * CALCULATE INTERACTIONS *
2813 **************************/
2815 r23 = _mm256_mul_ps(rsq23,rinv23);
2816 r23 = _mm256_andnot_ps(dummy_mask,r23);
2818 /* Calculate table index by multiplying r with table scale and truncate to integer */
2819 rt = _mm256_mul_ps(r23,vftabscale);
2820 vfitab = _mm256_cvttps_epi32(rt);
2821 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2822 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2823 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2824 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2825 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2826 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2828 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2829 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2830 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2831 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2832 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2833 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2834 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2835 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2836 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2837 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2838 Heps = _mm256_mul_ps(vfeps,H);
2839 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2840 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2841 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2845 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2847 /* Calculate temporary vectorial force */
2848 tx = _mm256_mul_ps(fscal,dx23);
2849 ty = _mm256_mul_ps(fscal,dy23);
2850 tz = _mm256_mul_ps(fscal,dz23);
2852 /* Update vectorial force */
2853 fix2 = _mm256_add_ps(fix2,tx);
2854 fiy2 = _mm256_add_ps(fiy2,ty);
2855 fiz2 = _mm256_add_ps(fiz2,tz);
2857 fjx3 = _mm256_add_ps(fjx3,tx);
2858 fjy3 = _mm256_add_ps(fjy3,ty);
2859 fjz3 = _mm256_add_ps(fjz3,tz);
2861 /**************************
2862 * CALCULATE INTERACTIONS *
2863 **************************/
2865 r31 = _mm256_mul_ps(rsq31,rinv31);
2866 r31 = _mm256_andnot_ps(dummy_mask,r31);
2868 /* Calculate table index by multiplying r with table scale and truncate to integer */
2869 rt = _mm256_mul_ps(r31,vftabscale);
2870 vfitab = _mm256_cvttps_epi32(rt);
2871 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2872 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2873 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2874 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2875 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2876 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2878 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2879 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2880 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2881 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2882 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2883 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2884 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2885 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2886 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2887 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2888 Heps = _mm256_mul_ps(vfeps,H);
2889 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2890 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2891 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2895 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2897 /* Calculate temporary vectorial force */
2898 tx = _mm256_mul_ps(fscal,dx31);
2899 ty = _mm256_mul_ps(fscal,dy31);
2900 tz = _mm256_mul_ps(fscal,dz31);
2902 /* Update vectorial force */
2903 fix3 = _mm256_add_ps(fix3,tx);
2904 fiy3 = _mm256_add_ps(fiy3,ty);
2905 fiz3 = _mm256_add_ps(fiz3,tz);
2907 fjx1 = _mm256_add_ps(fjx1,tx);
2908 fjy1 = _mm256_add_ps(fjy1,ty);
2909 fjz1 = _mm256_add_ps(fjz1,tz);
2911 /**************************
2912 * CALCULATE INTERACTIONS *
2913 **************************/
2915 r32 = _mm256_mul_ps(rsq32,rinv32);
2916 r32 = _mm256_andnot_ps(dummy_mask,r32);
2918 /* Calculate table index by multiplying r with table scale and truncate to integer */
2919 rt = _mm256_mul_ps(r32,vftabscale);
2920 vfitab = _mm256_cvttps_epi32(rt);
2921 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2922 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2923 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2924 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2925 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2926 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2928 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2929 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2930 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2931 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2932 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2933 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2934 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2935 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2936 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2937 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2938 Heps = _mm256_mul_ps(vfeps,H);
2939 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2940 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2941 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2945 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2947 /* Calculate temporary vectorial force */
2948 tx = _mm256_mul_ps(fscal,dx32);
2949 ty = _mm256_mul_ps(fscal,dy32);
2950 tz = _mm256_mul_ps(fscal,dz32);
2952 /* Update vectorial force */
2953 fix3 = _mm256_add_ps(fix3,tx);
2954 fiy3 = _mm256_add_ps(fiy3,ty);
2955 fiz3 = _mm256_add_ps(fiz3,tz);
2957 fjx2 = _mm256_add_ps(fjx2,tx);
2958 fjy2 = _mm256_add_ps(fjy2,ty);
2959 fjz2 = _mm256_add_ps(fjz2,tz);
2961 /**************************
2962 * CALCULATE INTERACTIONS *
2963 **************************/
2965 r33 = _mm256_mul_ps(rsq33,rinv33);
2966 r33 = _mm256_andnot_ps(dummy_mask,r33);
2968 /* Calculate table index by multiplying r with table scale and truncate to integer */
2969 rt = _mm256_mul_ps(r33,vftabscale);
2970 vfitab = _mm256_cvttps_epi32(rt);
2971 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2972 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2973 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2974 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2975 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2976 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2978 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2979 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2980 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2981 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2982 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2983 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2984 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2985 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2986 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2987 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2988 Heps = _mm256_mul_ps(vfeps,H);
2989 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2990 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2991 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2995 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2997 /* Calculate temporary vectorial force */
2998 tx = _mm256_mul_ps(fscal,dx33);
2999 ty = _mm256_mul_ps(fscal,dy33);
3000 tz = _mm256_mul_ps(fscal,dz33);
3002 /* Update vectorial force */
3003 fix3 = _mm256_add_ps(fix3,tx);
3004 fiy3 = _mm256_add_ps(fiy3,ty);
3005 fiz3 = _mm256_add_ps(fiz3,tz);
3007 fjx3 = _mm256_add_ps(fjx3,tx);
3008 fjy3 = _mm256_add_ps(fjy3,ty);
3009 fjz3 = _mm256_add_ps(fjz3,tz);
3011 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3012 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3013 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3014 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3015 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3016 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3017 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3018 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
3020 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
3021 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3022 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3024 /* Inner loop uses 412 flops */
3027 /* End of innermost loop */
3029 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3030 f+i_coord_offset,fshift+i_shift_offset);
3032 /* Increment number of inner iterations */
3033 inneriter += j_index_end - j_index_start;
3035 /* Outer loop uses 24 flops */
3038 /* Increment number of outer iterations */
3041 /* Update outer/inner flops */
3043 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*412);