2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_single.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
51 * Electrostatics interaction: CubicSplineTable
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water3-Water3
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
69 * jnr indices corresponding to data put in the eight positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrE,jnrF,jnrG,jnrH;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
79 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
81 real *shiftvec,*fshift,*x,*f;
82 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
84 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
85 real * vdwioffsetptr0;
86 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
87 real * vdwioffsetptr1;
88 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
89 real * vdwioffsetptr2;
90 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
92 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
94 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
96 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
98 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
99 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
100 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
101 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
102 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
103 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
104 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
105 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
106 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
109 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
112 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
113 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
115 __m128i vfitab_lo,vfitab_hi;
116 __m128i ifour = _mm_set1_epi32(4);
117 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
119 __m256 dummy_mask,cutoff_mask;
120 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
121 __m256 one = _mm256_set1_ps(1.0);
122 __m256 two = _mm256_set1_ps(2.0);
128 jindex = nlist->jindex;
130 shiftidx = nlist->shift;
132 shiftvec = fr->shift_vec[0];
133 fshift = fr->fshift[0];
134 facel = _mm256_set1_ps(fr->ic->epsfac);
135 charge = mdatoms->chargeA;
136 nvdwtype = fr->ntype;
138 vdwtype = mdatoms->typeA;
140 vftab = kernel_data->table_elec_vdw->data;
141 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
143 /* Setup water-specific parameters */
144 inr = nlist->iinr[0];
145 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
146 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
147 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
148 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
150 jq0 = _mm256_set1_ps(charge[inr+0]);
151 jq1 = _mm256_set1_ps(charge[inr+1]);
152 jq2 = _mm256_set1_ps(charge[inr+2]);
153 vdwjidx0A = 2*vdwtype[inr+0];
154 qq00 = _mm256_mul_ps(iq0,jq0);
155 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
156 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
157 qq01 = _mm256_mul_ps(iq0,jq1);
158 qq02 = _mm256_mul_ps(iq0,jq2);
159 qq10 = _mm256_mul_ps(iq1,jq0);
160 qq11 = _mm256_mul_ps(iq1,jq1);
161 qq12 = _mm256_mul_ps(iq1,jq2);
162 qq20 = _mm256_mul_ps(iq2,jq0);
163 qq21 = _mm256_mul_ps(iq2,jq1);
164 qq22 = _mm256_mul_ps(iq2,jq2);
166 /* Avoid stupid compiler warnings */
167 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
180 for(iidx=0;iidx<4*DIM;iidx++)
185 /* Start outer loop over neighborlists */
186 for(iidx=0; iidx<nri; iidx++)
188 /* Load shift vector for this list */
189 i_shift_offset = DIM*shiftidx[iidx];
191 /* Load limits for loop over neighbors */
192 j_index_start = jindex[iidx];
193 j_index_end = jindex[iidx+1];
195 /* Get outer coordinate index */
197 i_coord_offset = DIM*inr;
199 /* Load i particle coords and add shift vector */
200 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
201 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
203 fix0 = _mm256_setzero_ps();
204 fiy0 = _mm256_setzero_ps();
205 fiz0 = _mm256_setzero_ps();
206 fix1 = _mm256_setzero_ps();
207 fiy1 = _mm256_setzero_ps();
208 fiz1 = _mm256_setzero_ps();
209 fix2 = _mm256_setzero_ps();
210 fiy2 = _mm256_setzero_ps();
211 fiz2 = _mm256_setzero_ps();
213 /* Reset potential sums */
214 velecsum = _mm256_setzero_ps();
215 vvdwsum = _mm256_setzero_ps();
217 /* Start inner kernel loop */
218 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
221 /* Get j neighbor index, and coordinate index */
230 j_coord_offsetA = DIM*jnrA;
231 j_coord_offsetB = DIM*jnrB;
232 j_coord_offsetC = DIM*jnrC;
233 j_coord_offsetD = DIM*jnrD;
234 j_coord_offsetE = DIM*jnrE;
235 j_coord_offsetF = DIM*jnrF;
236 j_coord_offsetG = DIM*jnrG;
237 j_coord_offsetH = DIM*jnrH;
239 /* load j atom coordinates */
240 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
241 x+j_coord_offsetC,x+j_coord_offsetD,
242 x+j_coord_offsetE,x+j_coord_offsetF,
243 x+j_coord_offsetG,x+j_coord_offsetH,
244 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
246 /* Calculate displacement vector */
247 dx00 = _mm256_sub_ps(ix0,jx0);
248 dy00 = _mm256_sub_ps(iy0,jy0);
249 dz00 = _mm256_sub_ps(iz0,jz0);
250 dx01 = _mm256_sub_ps(ix0,jx1);
251 dy01 = _mm256_sub_ps(iy0,jy1);
252 dz01 = _mm256_sub_ps(iz0,jz1);
253 dx02 = _mm256_sub_ps(ix0,jx2);
254 dy02 = _mm256_sub_ps(iy0,jy2);
255 dz02 = _mm256_sub_ps(iz0,jz2);
256 dx10 = _mm256_sub_ps(ix1,jx0);
257 dy10 = _mm256_sub_ps(iy1,jy0);
258 dz10 = _mm256_sub_ps(iz1,jz0);
259 dx11 = _mm256_sub_ps(ix1,jx1);
260 dy11 = _mm256_sub_ps(iy1,jy1);
261 dz11 = _mm256_sub_ps(iz1,jz1);
262 dx12 = _mm256_sub_ps(ix1,jx2);
263 dy12 = _mm256_sub_ps(iy1,jy2);
264 dz12 = _mm256_sub_ps(iz1,jz2);
265 dx20 = _mm256_sub_ps(ix2,jx0);
266 dy20 = _mm256_sub_ps(iy2,jy0);
267 dz20 = _mm256_sub_ps(iz2,jz0);
268 dx21 = _mm256_sub_ps(ix2,jx1);
269 dy21 = _mm256_sub_ps(iy2,jy1);
270 dz21 = _mm256_sub_ps(iz2,jz1);
271 dx22 = _mm256_sub_ps(ix2,jx2);
272 dy22 = _mm256_sub_ps(iy2,jy2);
273 dz22 = _mm256_sub_ps(iz2,jz2);
275 /* Calculate squared distance and things based on it */
276 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
277 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
278 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
279 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
280 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
281 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
282 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
283 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
284 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
286 rinv00 = avx256_invsqrt_f(rsq00);
287 rinv01 = avx256_invsqrt_f(rsq01);
288 rinv02 = avx256_invsqrt_f(rsq02);
289 rinv10 = avx256_invsqrt_f(rsq10);
290 rinv11 = avx256_invsqrt_f(rsq11);
291 rinv12 = avx256_invsqrt_f(rsq12);
292 rinv20 = avx256_invsqrt_f(rsq20);
293 rinv21 = avx256_invsqrt_f(rsq21);
294 rinv22 = avx256_invsqrt_f(rsq22);
296 fjx0 = _mm256_setzero_ps();
297 fjy0 = _mm256_setzero_ps();
298 fjz0 = _mm256_setzero_ps();
299 fjx1 = _mm256_setzero_ps();
300 fjy1 = _mm256_setzero_ps();
301 fjz1 = _mm256_setzero_ps();
302 fjx2 = _mm256_setzero_ps();
303 fjy2 = _mm256_setzero_ps();
304 fjz2 = _mm256_setzero_ps();
306 /**************************
307 * CALCULATE INTERACTIONS *
308 **************************/
310 r00 = _mm256_mul_ps(rsq00,rinv00);
312 /* Calculate table index by multiplying r with table scale and truncate to integer */
313 rt = _mm256_mul_ps(r00,vftabscale);
314 vfitab = _mm256_cvttps_epi32(rt);
315 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
316 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
317 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
318 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
319 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
320 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
322 /* CUBIC SPLINE TABLE ELECTROSTATICS */
323 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
324 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
325 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
326 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
327 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
328 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
329 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
330 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
331 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
332 Heps = _mm256_mul_ps(vfeps,H);
333 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
334 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
335 velec = _mm256_mul_ps(qq00,VV);
336 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
337 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
339 /* CUBIC SPLINE TABLE DISPERSION */
340 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
341 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
342 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
343 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
344 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
345 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
346 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
347 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
348 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
349 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
350 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
351 Heps = _mm256_mul_ps(vfeps,H);
352 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
353 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
354 vvdw6 = _mm256_mul_ps(c6_00,VV);
355 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
356 fvdw6 = _mm256_mul_ps(c6_00,FF);
358 /* CUBIC SPLINE TABLE REPULSION */
359 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
360 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
361 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
362 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
363 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
364 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
365 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
366 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
367 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
368 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
369 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
370 Heps = _mm256_mul_ps(vfeps,H);
371 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
372 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
373 vvdw12 = _mm256_mul_ps(c12_00,VV);
374 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
375 fvdw12 = _mm256_mul_ps(c12_00,FF);
376 vvdw = _mm256_add_ps(vvdw12,vvdw6);
377 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
379 /* Update potential sum for this i atom from the interaction with this j atom. */
380 velecsum = _mm256_add_ps(velecsum,velec);
381 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
383 fscal = _mm256_add_ps(felec,fvdw);
385 /* Calculate temporary vectorial force */
386 tx = _mm256_mul_ps(fscal,dx00);
387 ty = _mm256_mul_ps(fscal,dy00);
388 tz = _mm256_mul_ps(fscal,dz00);
390 /* Update vectorial force */
391 fix0 = _mm256_add_ps(fix0,tx);
392 fiy0 = _mm256_add_ps(fiy0,ty);
393 fiz0 = _mm256_add_ps(fiz0,tz);
395 fjx0 = _mm256_add_ps(fjx0,tx);
396 fjy0 = _mm256_add_ps(fjy0,ty);
397 fjz0 = _mm256_add_ps(fjz0,tz);
399 /**************************
400 * CALCULATE INTERACTIONS *
401 **************************/
403 r01 = _mm256_mul_ps(rsq01,rinv01);
405 /* Calculate table index by multiplying r with table scale and truncate to integer */
406 rt = _mm256_mul_ps(r01,vftabscale);
407 vfitab = _mm256_cvttps_epi32(rt);
408 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
409 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
410 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
411 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
412 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
413 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
415 /* CUBIC SPLINE TABLE ELECTROSTATICS */
416 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
417 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
418 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
419 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
420 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
421 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
422 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
423 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
424 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
425 Heps = _mm256_mul_ps(vfeps,H);
426 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
427 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
428 velec = _mm256_mul_ps(qq01,VV);
429 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
430 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velecsum = _mm256_add_ps(velecsum,velec);
437 /* Calculate temporary vectorial force */
438 tx = _mm256_mul_ps(fscal,dx01);
439 ty = _mm256_mul_ps(fscal,dy01);
440 tz = _mm256_mul_ps(fscal,dz01);
442 /* Update vectorial force */
443 fix0 = _mm256_add_ps(fix0,tx);
444 fiy0 = _mm256_add_ps(fiy0,ty);
445 fiz0 = _mm256_add_ps(fiz0,tz);
447 fjx1 = _mm256_add_ps(fjx1,tx);
448 fjy1 = _mm256_add_ps(fjy1,ty);
449 fjz1 = _mm256_add_ps(fjz1,tz);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 r02 = _mm256_mul_ps(rsq02,rinv02);
457 /* Calculate table index by multiplying r with table scale and truncate to integer */
458 rt = _mm256_mul_ps(r02,vftabscale);
459 vfitab = _mm256_cvttps_epi32(rt);
460 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
461 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
462 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
463 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
464 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
465 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
467 /* CUBIC SPLINE TABLE ELECTROSTATICS */
468 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
469 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
470 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
471 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
472 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
473 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
474 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
475 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
476 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
477 Heps = _mm256_mul_ps(vfeps,H);
478 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
479 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
480 velec = _mm256_mul_ps(qq02,VV);
481 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
482 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
484 /* Update potential sum for this i atom from the interaction with this j atom. */
485 velecsum = _mm256_add_ps(velecsum,velec);
489 /* Calculate temporary vectorial force */
490 tx = _mm256_mul_ps(fscal,dx02);
491 ty = _mm256_mul_ps(fscal,dy02);
492 tz = _mm256_mul_ps(fscal,dz02);
494 /* Update vectorial force */
495 fix0 = _mm256_add_ps(fix0,tx);
496 fiy0 = _mm256_add_ps(fiy0,ty);
497 fiz0 = _mm256_add_ps(fiz0,tz);
499 fjx2 = _mm256_add_ps(fjx2,tx);
500 fjy2 = _mm256_add_ps(fjy2,ty);
501 fjz2 = _mm256_add_ps(fjz2,tz);
503 /**************************
504 * CALCULATE INTERACTIONS *
505 **************************/
507 r10 = _mm256_mul_ps(rsq10,rinv10);
509 /* Calculate table index by multiplying r with table scale and truncate to integer */
510 rt = _mm256_mul_ps(r10,vftabscale);
511 vfitab = _mm256_cvttps_epi32(rt);
512 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
513 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
514 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
515 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
516 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
517 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
519 /* CUBIC SPLINE TABLE ELECTROSTATICS */
520 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
521 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
522 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
523 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
524 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
525 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
526 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
527 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
528 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
529 Heps = _mm256_mul_ps(vfeps,H);
530 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
531 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
532 velec = _mm256_mul_ps(qq10,VV);
533 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
534 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velecsum = _mm256_add_ps(velecsum,velec);
541 /* Calculate temporary vectorial force */
542 tx = _mm256_mul_ps(fscal,dx10);
543 ty = _mm256_mul_ps(fscal,dy10);
544 tz = _mm256_mul_ps(fscal,dz10);
546 /* Update vectorial force */
547 fix1 = _mm256_add_ps(fix1,tx);
548 fiy1 = _mm256_add_ps(fiy1,ty);
549 fiz1 = _mm256_add_ps(fiz1,tz);
551 fjx0 = _mm256_add_ps(fjx0,tx);
552 fjy0 = _mm256_add_ps(fjy0,ty);
553 fjz0 = _mm256_add_ps(fjz0,tz);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 r11 = _mm256_mul_ps(rsq11,rinv11);
561 /* Calculate table index by multiplying r with table scale and truncate to integer */
562 rt = _mm256_mul_ps(r11,vftabscale);
563 vfitab = _mm256_cvttps_epi32(rt);
564 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
565 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
566 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
567 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
568 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
569 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
571 /* CUBIC SPLINE TABLE ELECTROSTATICS */
572 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
573 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
574 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
575 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
576 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
577 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
578 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
579 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
580 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
581 Heps = _mm256_mul_ps(vfeps,H);
582 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
583 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
584 velec = _mm256_mul_ps(qq11,VV);
585 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
586 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
588 /* Update potential sum for this i atom from the interaction with this j atom. */
589 velecsum = _mm256_add_ps(velecsum,velec);
593 /* Calculate temporary vectorial force */
594 tx = _mm256_mul_ps(fscal,dx11);
595 ty = _mm256_mul_ps(fscal,dy11);
596 tz = _mm256_mul_ps(fscal,dz11);
598 /* Update vectorial force */
599 fix1 = _mm256_add_ps(fix1,tx);
600 fiy1 = _mm256_add_ps(fiy1,ty);
601 fiz1 = _mm256_add_ps(fiz1,tz);
603 fjx1 = _mm256_add_ps(fjx1,tx);
604 fjy1 = _mm256_add_ps(fjy1,ty);
605 fjz1 = _mm256_add_ps(fjz1,tz);
607 /**************************
608 * CALCULATE INTERACTIONS *
609 **************************/
611 r12 = _mm256_mul_ps(rsq12,rinv12);
613 /* Calculate table index by multiplying r with table scale and truncate to integer */
614 rt = _mm256_mul_ps(r12,vftabscale);
615 vfitab = _mm256_cvttps_epi32(rt);
616 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
617 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
618 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
619 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
620 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
621 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
623 /* CUBIC SPLINE TABLE ELECTROSTATICS */
624 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
625 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
626 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
627 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
628 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
629 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
630 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
631 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
632 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
633 Heps = _mm256_mul_ps(vfeps,H);
634 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
635 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
636 velec = _mm256_mul_ps(qq12,VV);
637 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
638 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
640 /* Update potential sum for this i atom from the interaction with this j atom. */
641 velecsum = _mm256_add_ps(velecsum,velec);
645 /* Calculate temporary vectorial force */
646 tx = _mm256_mul_ps(fscal,dx12);
647 ty = _mm256_mul_ps(fscal,dy12);
648 tz = _mm256_mul_ps(fscal,dz12);
650 /* Update vectorial force */
651 fix1 = _mm256_add_ps(fix1,tx);
652 fiy1 = _mm256_add_ps(fiy1,ty);
653 fiz1 = _mm256_add_ps(fiz1,tz);
655 fjx2 = _mm256_add_ps(fjx2,tx);
656 fjy2 = _mm256_add_ps(fjy2,ty);
657 fjz2 = _mm256_add_ps(fjz2,tz);
659 /**************************
660 * CALCULATE INTERACTIONS *
661 **************************/
663 r20 = _mm256_mul_ps(rsq20,rinv20);
665 /* Calculate table index by multiplying r with table scale and truncate to integer */
666 rt = _mm256_mul_ps(r20,vftabscale);
667 vfitab = _mm256_cvttps_epi32(rt);
668 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
669 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
670 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
671 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
672 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
673 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
675 /* CUBIC SPLINE TABLE ELECTROSTATICS */
676 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
677 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
678 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
679 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
680 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
681 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
682 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
683 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
684 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
685 Heps = _mm256_mul_ps(vfeps,H);
686 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
687 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
688 velec = _mm256_mul_ps(qq20,VV);
689 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
690 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
692 /* Update potential sum for this i atom from the interaction with this j atom. */
693 velecsum = _mm256_add_ps(velecsum,velec);
697 /* Calculate temporary vectorial force */
698 tx = _mm256_mul_ps(fscal,dx20);
699 ty = _mm256_mul_ps(fscal,dy20);
700 tz = _mm256_mul_ps(fscal,dz20);
702 /* Update vectorial force */
703 fix2 = _mm256_add_ps(fix2,tx);
704 fiy2 = _mm256_add_ps(fiy2,ty);
705 fiz2 = _mm256_add_ps(fiz2,tz);
707 fjx0 = _mm256_add_ps(fjx0,tx);
708 fjy0 = _mm256_add_ps(fjy0,ty);
709 fjz0 = _mm256_add_ps(fjz0,tz);
711 /**************************
712 * CALCULATE INTERACTIONS *
713 **************************/
715 r21 = _mm256_mul_ps(rsq21,rinv21);
717 /* Calculate table index by multiplying r with table scale and truncate to integer */
718 rt = _mm256_mul_ps(r21,vftabscale);
719 vfitab = _mm256_cvttps_epi32(rt);
720 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
721 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
722 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
723 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
724 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
725 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
727 /* CUBIC SPLINE TABLE ELECTROSTATICS */
728 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
729 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
730 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
731 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
732 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
733 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
734 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
735 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
736 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
737 Heps = _mm256_mul_ps(vfeps,H);
738 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
739 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
740 velec = _mm256_mul_ps(qq21,VV);
741 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
742 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
744 /* Update potential sum for this i atom from the interaction with this j atom. */
745 velecsum = _mm256_add_ps(velecsum,velec);
749 /* Calculate temporary vectorial force */
750 tx = _mm256_mul_ps(fscal,dx21);
751 ty = _mm256_mul_ps(fscal,dy21);
752 tz = _mm256_mul_ps(fscal,dz21);
754 /* Update vectorial force */
755 fix2 = _mm256_add_ps(fix2,tx);
756 fiy2 = _mm256_add_ps(fiy2,ty);
757 fiz2 = _mm256_add_ps(fiz2,tz);
759 fjx1 = _mm256_add_ps(fjx1,tx);
760 fjy1 = _mm256_add_ps(fjy1,ty);
761 fjz1 = _mm256_add_ps(fjz1,tz);
763 /**************************
764 * CALCULATE INTERACTIONS *
765 **************************/
767 r22 = _mm256_mul_ps(rsq22,rinv22);
769 /* Calculate table index by multiplying r with table scale and truncate to integer */
770 rt = _mm256_mul_ps(r22,vftabscale);
771 vfitab = _mm256_cvttps_epi32(rt);
772 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
773 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
774 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
775 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
776 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
777 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
779 /* CUBIC SPLINE TABLE ELECTROSTATICS */
780 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
781 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
782 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
783 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
784 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
785 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
786 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
787 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
788 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
789 Heps = _mm256_mul_ps(vfeps,H);
790 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
791 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
792 velec = _mm256_mul_ps(qq22,VV);
793 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
794 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
796 /* Update potential sum for this i atom from the interaction with this j atom. */
797 velecsum = _mm256_add_ps(velecsum,velec);
801 /* Calculate temporary vectorial force */
802 tx = _mm256_mul_ps(fscal,dx22);
803 ty = _mm256_mul_ps(fscal,dy22);
804 tz = _mm256_mul_ps(fscal,dz22);
806 /* Update vectorial force */
807 fix2 = _mm256_add_ps(fix2,tx);
808 fiy2 = _mm256_add_ps(fiy2,ty);
809 fiz2 = _mm256_add_ps(fiz2,tz);
811 fjx2 = _mm256_add_ps(fjx2,tx);
812 fjy2 = _mm256_add_ps(fjy2,ty);
813 fjz2 = _mm256_add_ps(fjz2,tz);
815 fjptrA = f+j_coord_offsetA;
816 fjptrB = f+j_coord_offsetB;
817 fjptrC = f+j_coord_offsetC;
818 fjptrD = f+j_coord_offsetD;
819 fjptrE = f+j_coord_offsetE;
820 fjptrF = f+j_coord_offsetF;
821 fjptrG = f+j_coord_offsetG;
822 fjptrH = f+j_coord_offsetH;
824 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
825 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
827 /* Inner loop uses 417 flops */
833 /* Get j neighbor index, and coordinate index */
834 jnrlistA = jjnr[jidx];
835 jnrlistB = jjnr[jidx+1];
836 jnrlistC = jjnr[jidx+2];
837 jnrlistD = jjnr[jidx+3];
838 jnrlistE = jjnr[jidx+4];
839 jnrlistF = jjnr[jidx+5];
840 jnrlistG = jjnr[jidx+6];
841 jnrlistH = jjnr[jidx+7];
842 /* Sign of each element will be negative for non-real atoms.
843 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
844 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
846 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
847 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
849 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
850 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
851 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
852 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
853 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
854 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
855 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
856 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
857 j_coord_offsetA = DIM*jnrA;
858 j_coord_offsetB = DIM*jnrB;
859 j_coord_offsetC = DIM*jnrC;
860 j_coord_offsetD = DIM*jnrD;
861 j_coord_offsetE = DIM*jnrE;
862 j_coord_offsetF = DIM*jnrF;
863 j_coord_offsetG = DIM*jnrG;
864 j_coord_offsetH = DIM*jnrH;
866 /* load j atom coordinates */
867 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
868 x+j_coord_offsetC,x+j_coord_offsetD,
869 x+j_coord_offsetE,x+j_coord_offsetF,
870 x+j_coord_offsetG,x+j_coord_offsetH,
871 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
873 /* Calculate displacement vector */
874 dx00 = _mm256_sub_ps(ix0,jx0);
875 dy00 = _mm256_sub_ps(iy0,jy0);
876 dz00 = _mm256_sub_ps(iz0,jz0);
877 dx01 = _mm256_sub_ps(ix0,jx1);
878 dy01 = _mm256_sub_ps(iy0,jy1);
879 dz01 = _mm256_sub_ps(iz0,jz1);
880 dx02 = _mm256_sub_ps(ix0,jx2);
881 dy02 = _mm256_sub_ps(iy0,jy2);
882 dz02 = _mm256_sub_ps(iz0,jz2);
883 dx10 = _mm256_sub_ps(ix1,jx0);
884 dy10 = _mm256_sub_ps(iy1,jy0);
885 dz10 = _mm256_sub_ps(iz1,jz0);
886 dx11 = _mm256_sub_ps(ix1,jx1);
887 dy11 = _mm256_sub_ps(iy1,jy1);
888 dz11 = _mm256_sub_ps(iz1,jz1);
889 dx12 = _mm256_sub_ps(ix1,jx2);
890 dy12 = _mm256_sub_ps(iy1,jy2);
891 dz12 = _mm256_sub_ps(iz1,jz2);
892 dx20 = _mm256_sub_ps(ix2,jx0);
893 dy20 = _mm256_sub_ps(iy2,jy0);
894 dz20 = _mm256_sub_ps(iz2,jz0);
895 dx21 = _mm256_sub_ps(ix2,jx1);
896 dy21 = _mm256_sub_ps(iy2,jy1);
897 dz21 = _mm256_sub_ps(iz2,jz1);
898 dx22 = _mm256_sub_ps(ix2,jx2);
899 dy22 = _mm256_sub_ps(iy2,jy2);
900 dz22 = _mm256_sub_ps(iz2,jz2);
902 /* Calculate squared distance and things based on it */
903 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
904 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
905 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
906 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
907 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
908 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
909 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
910 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
911 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
913 rinv00 = avx256_invsqrt_f(rsq00);
914 rinv01 = avx256_invsqrt_f(rsq01);
915 rinv02 = avx256_invsqrt_f(rsq02);
916 rinv10 = avx256_invsqrt_f(rsq10);
917 rinv11 = avx256_invsqrt_f(rsq11);
918 rinv12 = avx256_invsqrt_f(rsq12);
919 rinv20 = avx256_invsqrt_f(rsq20);
920 rinv21 = avx256_invsqrt_f(rsq21);
921 rinv22 = avx256_invsqrt_f(rsq22);
923 fjx0 = _mm256_setzero_ps();
924 fjy0 = _mm256_setzero_ps();
925 fjz0 = _mm256_setzero_ps();
926 fjx1 = _mm256_setzero_ps();
927 fjy1 = _mm256_setzero_ps();
928 fjz1 = _mm256_setzero_ps();
929 fjx2 = _mm256_setzero_ps();
930 fjy2 = _mm256_setzero_ps();
931 fjz2 = _mm256_setzero_ps();
933 /**************************
934 * CALCULATE INTERACTIONS *
935 **************************/
937 r00 = _mm256_mul_ps(rsq00,rinv00);
938 r00 = _mm256_andnot_ps(dummy_mask,r00);
940 /* Calculate table index by multiplying r with table scale and truncate to integer */
941 rt = _mm256_mul_ps(r00,vftabscale);
942 vfitab = _mm256_cvttps_epi32(rt);
943 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
944 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
945 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
946 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
947 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
948 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
950 /* CUBIC SPLINE TABLE ELECTROSTATICS */
951 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
952 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
953 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
954 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
955 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
956 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
957 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
958 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
959 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
960 Heps = _mm256_mul_ps(vfeps,H);
961 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
962 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
963 velec = _mm256_mul_ps(qq00,VV);
964 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
965 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
967 /* CUBIC SPLINE TABLE DISPERSION */
968 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
969 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
970 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
971 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
972 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
973 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
974 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
975 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
976 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
977 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
978 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
979 Heps = _mm256_mul_ps(vfeps,H);
980 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
981 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
982 vvdw6 = _mm256_mul_ps(c6_00,VV);
983 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
984 fvdw6 = _mm256_mul_ps(c6_00,FF);
986 /* CUBIC SPLINE TABLE REPULSION */
987 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
988 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
989 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
990 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
991 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
992 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
993 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
994 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
995 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
996 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
997 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
998 Heps = _mm256_mul_ps(vfeps,H);
999 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1000 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1001 vvdw12 = _mm256_mul_ps(c12_00,VV);
1002 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1003 fvdw12 = _mm256_mul_ps(c12_00,FF);
1004 vvdw = _mm256_add_ps(vvdw12,vvdw6);
1005 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1007 /* Update potential sum for this i atom from the interaction with this j atom. */
1008 velec = _mm256_andnot_ps(dummy_mask,velec);
1009 velecsum = _mm256_add_ps(velecsum,velec);
1010 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
1011 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
1013 fscal = _mm256_add_ps(felec,fvdw);
1015 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1017 /* Calculate temporary vectorial force */
1018 tx = _mm256_mul_ps(fscal,dx00);
1019 ty = _mm256_mul_ps(fscal,dy00);
1020 tz = _mm256_mul_ps(fscal,dz00);
1022 /* Update vectorial force */
1023 fix0 = _mm256_add_ps(fix0,tx);
1024 fiy0 = _mm256_add_ps(fiy0,ty);
1025 fiz0 = _mm256_add_ps(fiz0,tz);
1027 fjx0 = _mm256_add_ps(fjx0,tx);
1028 fjy0 = _mm256_add_ps(fjy0,ty);
1029 fjz0 = _mm256_add_ps(fjz0,tz);
1031 /**************************
1032 * CALCULATE INTERACTIONS *
1033 **************************/
1035 r01 = _mm256_mul_ps(rsq01,rinv01);
1036 r01 = _mm256_andnot_ps(dummy_mask,r01);
1038 /* Calculate table index by multiplying r with table scale and truncate to integer */
1039 rt = _mm256_mul_ps(r01,vftabscale);
1040 vfitab = _mm256_cvttps_epi32(rt);
1041 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1042 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1043 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1044 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1045 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1046 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1048 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1049 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1050 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1051 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1052 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1053 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1054 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1055 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1056 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1057 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1058 Heps = _mm256_mul_ps(vfeps,H);
1059 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1060 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1061 velec = _mm256_mul_ps(qq01,VV);
1062 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1063 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1065 /* Update potential sum for this i atom from the interaction with this j atom. */
1066 velec = _mm256_andnot_ps(dummy_mask,velec);
1067 velecsum = _mm256_add_ps(velecsum,velec);
1071 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1073 /* Calculate temporary vectorial force */
1074 tx = _mm256_mul_ps(fscal,dx01);
1075 ty = _mm256_mul_ps(fscal,dy01);
1076 tz = _mm256_mul_ps(fscal,dz01);
1078 /* Update vectorial force */
1079 fix0 = _mm256_add_ps(fix0,tx);
1080 fiy0 = _mm256_add_ps(fiy0,ty);
1081 fiz0 = _mm256_add_ps(fiz0,tz);
1083 fjx1 = _mm256_add_ps(fjx1,tx);
1084 fjy1 = _mm256_add_ps(fjy1,ty);
1085 fjz1 = _mm256_add_ps(fjz1,tz);
1087 /**************************
1088 * CALCULATE INTERACTIONS *
1089 **************************/
1091 r02 = _mm256_mul_ps(rsq02,rinv02);
1092 r02 = _mm256_andnot_ps(dummy_mask,r02);
1094 /* Calculate table index by multiplying r with table scale and truncate to integer */
1095 rt = _mm256_mul_ps(r02,vftabscale);
1096 vfitab = _mm256_cvttps_epi32(rt);
1097 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1098 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1099 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1100 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1101 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1102 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1104 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1105 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1106 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1107 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1108 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1109 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1110 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1111 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1112 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1113 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1114 Heps = _mm256_mul_ps(vfeps,H);
1115 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1116 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1117 velec = _mm256_mul_ps(qq02,VV);
1118 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1119 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1121 /* Update potential sum for this i atom from the interaction with this j atom. */
1122 velec = _mm256_andnot_ps(dummy_mask,velec);
1123 velecsum = _mm256_add_ps(velecsum,velec);
1127 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1129 /* Calculate temporary vectorial force */
1130 tx = _mm256_mul_ps(fscal,dx02);
1131 ty = _mm256_mul_ps(fscal,dy02);
1132 tz = _mm256_mul_ps(fscal,dz02);
1134 /* Update vectorial force */
1135 fix0 = _mm256_add_ps(fix0,tx);
1136 fiy0 = _mm256_add_ps(fiy0,ty);
1137 fiz0 = _mm256_add_ps(fiz0,tz);
1139 fjx2 = _mm256_add_ps(fjx2,tx);
1140 fjy2 = _mm256_add_ps(fjy2,ty);
1141 fjz2 = _mm256_add_ps(fjz2,tz);
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 r10 = _mm256_mul_ps(rsq10,rinv10);
1148 r10 = _mm256_andnot_ps(dummy_mask,r10);
1150 /* Calculate table index by multiplying r with table scale and truncate to integer */
1151 rt = _mm256_mul_ps(r10,vftabscale);
1152 vfitab = _mm256_cvttps_epi32(rt);
1153 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1154 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1155 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1156 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1157 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1158 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1160 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1161 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1162 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1163 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1164 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1165 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1166 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1167 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1168 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1169 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1170 Heps = _mm256_mul_ps(vfeps,H);
1171 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1172 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1173 velec = _mm256_mul_ps(qq10,VV);
1174 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1175 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1177 /* Update potential sum for this i atom from the interaction with this j atom. */
1178 velec = _mm256_andnot_ps(dummy_mask,velec);
1179 velecsum = _mm256_add_ps(velecsum,velec);
1183 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1185 /* Calculate temporary vectorial force */
1186 tx = _mm256_mul_ps(fscal,dx10);
1187 ty = _mm256_mul_ps(fscal,dy10);
1188 tz = _mm256_mul_ps(fscal,dz10);
1190 /* Update vectorial force */
1191 fix1 = _mm256_add_ps(fix1,tx);
1192 fiy1 = _mm256_add_ps(fiy1,ty);
1193 fiz1 = _mm256_add_ps(fiz1,tz);
1195 fjx0 = _mm256_add_ps(fjx0,tx);
1196 fjy0 = _mm256_add_ps(fjy0,ty);
1197 fjz0 = _mm256_add_ps(fjz0,tz);
1199 /**************************
1200 * CALCULATE INTERACTIONS *
1201 **************************/
1203 r11 = _mm256_mul_ps(rsq11,rinv11);
1204 r11 = _mm256_andnot_ps(dummy_mask,r11);
1206 /* Calculate table index by multiplying r with table scale and truncate to integer */
1207 rt = _mm256_mul_ps(r11,vftabscale);
1208 vfitab = _mm256_cvttps_epi32(rt);
1209 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1210 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1211 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1212 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1213 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1214 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1216 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1217 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1218 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1219 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1220 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1221 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1222 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1223 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1224 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1225 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1226 Heps = _mm256_mul_ps(vfeps,H);
1227 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1228 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1229 velec = _mm256_mul_ps(qq11,VV);
1230 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1231 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1233 /* Update potential sum for this i atom from the interaction with this j atom. */
1234 velec = _mm256_andnot_ps(dummy_mask,velec);
1235 velecsum = _mm256_add_ps(velecsum,velec);
1239 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1241 /* Calculate temporary vectorial force */
1242 tx = _mm256_mul_ps(fscal,dx11);
1243 ty = _mm256_mul_ps(fscal,dy11);
1244 tz = _mm256_mul_ps(fscal,dz11);
1246 /* Update vectorial force */
1247 fix1 = _mm256_add_ps(fix1,tx);
1248 fiy1 = _mm256_add_ps(fiy1,ty);
1249 fiz1 = _mm256_add_ps(fiz1,tz);
1251 fjx1 = _mm256_add_ps(fjx1,tx);
1252 fjy1 = _mm256_add_ps(fjy1,ty);
1253 fjz1 = _mm256_add_ps(fjz1,tz);
1255 /**************************
1256 * CALCULATE INTERACTIONS *
1257 **************************/
1259 r12 = _mm256_mul_ps(rsq12,rinv12);
1260 r12 = _mm256_andnot_ps(dummy_mask,r12);
1262 /* Calculate table index by multiplying r with table scale and truncate to integer */
1263 rt = _mm256_mul_ps(r12,vftabscale);
1264 vfitab = _mm256_cvttps_epi32(rt);
1265 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1266 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1267 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1268 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1269 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1270 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1272 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1273 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1274 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1275 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1276 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1277 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1278 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1279 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1280 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1281 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1282 Heps = _mm256_mul_ps(vfeps,H);
1283 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1284 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1285 velec = _mm256_mul_ps(qq12,VV);
1286 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1287 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1289 /* Update potential sum for this i atom from the interaction with this j atom. */
1290 velec = _mm256_andnot_ps(dummy_mask,velec);
1291 velecsum = _mm256_add_ps(velecsum,velec);
1295 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1297 /* Calculate temporary vectorial force */
1298 tx = _mm256_mul_ps(fscal,dx12);
1299 ty = _mm256_mul_ps(fscal,dy12);
1300 tz = _mm256_mul_ps(fscal,dz12);
1302 /* Update vectorial force */
1303 fix1 = _mm256_add_ps(fix1,tx);
1304 fiy1 = _mm256_add_ps(fiy1,ty);
1305 fiz1 = _mm256_add_ps(fiz1,tz);
1307 fjx2 = _mm256_add_ps(fjx2,tx);
1308 fjy2 = _mm256_add_ps(fjy2,ty);
1309 fjz2 = _mm256_add_ps(fjz2,tz);
1311 /**************************
1312 * CALCULATE INTERACTIONS *
1313 **************************/
1315 r20 = _mm256_mul_ps(rsq20,rinv20);
1316 r20 = _mm256_andnot_ps(dummy_mask,r20);
1318 /* Calculate table index by multiplying r with table scale and truncate to integer */
1319 rt = _mm256_mul_ps(r20,vftabscale);
1320 vfitab = _mm256_cvttps_epi32(rt);
1321 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1322 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1323 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1324 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1325 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1326 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1328 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1329 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1330 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1331 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1332 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1333 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1334 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1335 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1336 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1337 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1338 Heps = _mm256_mul_ps(vfeps,H);
1339 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1340 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1341 velec = _mm256_mul_ps(qq20,VV);
1342 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1343 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1345 /* Update potential sum for this i atom from the interaction with this j atom. */
1346 velec = _mm256_andnot_ps(dummy_mask,velec);
1347 velecsum = _mm256_add_ps(velecsum,velec);
1351 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1353 /* Calculate temporary vectorial force */
1354 tx = _mm256_mul_ps(fscal,dx20);
1355 ty = _mm256_mul_ps(fscal,dy20);
1356 tz = _mm256_mul_ps(fscal,dz20);
1358 /* Update vectorial force */
1359 fix2 = _mm256_add_ps(fix2,tx);
1360 fiy2 = _mm256_add_ps(fiy2,ty);
1361 fiz2 = _mm256_add_ps(fiz2,tz);
1363 fjx0 = _mm256_add_ps(fjx0,tx);
1364 fjy0 = _mm256_add_ps(fjy0,ty);
1365 fjz0 = _mm256_add_ps(fjz0,tz);
1367 /**************************
1368 * CALCULATE INTERACTIONS *
1369 **************************/
1371 r21 = _mm256_mul_ps(rsq21,rinv21);
1372 r21 = _mm256_andnot_ps(dummy_mask,r21);
1374 /* Calculate table index by multiplying r with table scale and truncate to integer */
1375 rt = _mm256_mul_ps(r21,vftabscale);
1376 vfitab = _mm256_cvttps_epi32(rt);
1377 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1378 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1379 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1380 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1381 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1382 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1384 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1385 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1386 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1387 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1388 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1389 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1390 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1391 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1392 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1393 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1394 Heps = _mm256_mul_ps(vfeps,H);
1395 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1396 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1397 velec = _mm256_mul_ps(qq21,VV);
1398 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1399 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1401 /* Update potential sum for this i atom from the interaction with this j atom. */
1402 velec = _mm256_andnot_ps(dummy_mask,velec);
1403 velecsum = _mm256_add_ps(velecsum,velec);
1407 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1409 /* Calculate temporary vectorial force */
1410 tx = _mm256_mul_ps(fscal,dx21);
1411 ty = _mm256_mul_ps(fscal,dy21);
1412 tz = _mm256_mul_ps(fscal,dz21);
1414 /* Update vectorial force */
1415 fix2 = _mm256_add_ps(fix2,tx);
1416 fiy2 = _mm256_add_ps(fiy2,ty);
1417 fiz2 = _mm256_add_ps(fiz2,tz);
1419 fjx1 = _mm256_add_ps(fjx1,tx);
1420 fjy1 = _mm256_add_ps(fjy1,ty);
1421 fjz1 = _mm256_add_ps(fjz1,tz);
1423 /**************************
1424 * CALCULATE INTERACTIONS *
1425 **************************/
1427 r22 = _mm256_mul_ps(rsq22,rinv22);
1428 r22 = _mm256_andnot_ps(dummy_mask,r22);
1430 /* Calculate table index by multiplying r with table scale and truncate to integer */
1431 rt = _mm256_mul_ps(r22,vftabscale);
1432 vfitab = _mm256_cvttps_epi32(rt);
1433 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1434 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1435 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1436 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1437 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1438 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1440 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1441 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1442 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1443 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1444 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1445 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1446 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1447 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1448 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1449 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1450 Heps = _mm256_mul_ps(vfeps,H);
1451 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1452 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1453 velec = _mm256_mul_ps(qq22,VV);
1454 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1455 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1457 /* Update potential sum for this i atom from the interaction with this j atom. */
1458 velec = _mm256_andnot_ps(dummy_mask,velec);
1459 velecsum = _mm256_add_ps(velecsum,velec);
1463 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1465 /* Calculate temporary vectorial force */
1466 tx = _mm256_mul_ps(fscal,dx22);
1467 ty = _mm256_mul_ps(fscal,dy22);
1468 tz = _mm256_mul_ps(fscal,dz22);
1470 /* Update vectorial force */
1471 fix2 = _mm256_add_ps(fix2,tx);
1472 fiy2 = _mm256_add_ps(fiy2,ty);
1473 fiz2 = _mm256_add_ps(fiz2,tz);
1475 fjx2 = _mm256_add_ps(fjx2,tx);
1476 fjy2 = _mm256_add_ps(fjy2,ty);
1477 fjz2 = _mm256_add_ps(fjz2,tz);
1479 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1480 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1481 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1482 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1483 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1484 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1485 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1486 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1488 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1489 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1491 /* Inner loop uses 426 flops */
1494 /* End of innermost loop */
1496 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1497 f+i_coord_offset,fshift+i_shift_offset);
1500 /* Update potential energies */
1501 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1502 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1504 /* Increment number of inner iterations */
1505 inneriter += j_index_end - j_index_start;
1507 /* Outer loop uses 20 flops */
1510 /* Increment number of outer iterations */
1513 /* Update outer/inner flops */
1515 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
1518 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1519 * Electrostatics interaction: CubicSplineTable
1520 * VdW interaction: CubicSplineTable
1521 * Geometry: Water3-Water3
1522 * Calculate force/pot: Force
1525 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1526 (t_nblist * gmx_restrict nlist,
1527 rvec * gmx_restrict xx,
1528 rvec * gmx_restrict ff,
1529 struct t_forcerec * gmx_restrict fr,
1530 t_mdatoms * gmx_restrict mdatoms,
1531 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1532 t_nrnb * gmx_restrict nrnb)
1534 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1535 * just 0 for non-waters.
1536 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1537 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1539 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1540 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1541 int jnrA,jnrB,jnrC,jnrD;
1542 int jnrE,jnrF,jnrG,jnrH;
1543 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1544 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1545 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1546 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1547 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1548 real rcutoff_scalar;
1549 real *shiftvec,*fshift,*x,*f;
1550 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1551 real scratch[4*DIM];
1552 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1553 real * vdwioffsetptr0;
1554 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1555 real * vdwioffsetptr1;
1556 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1557 real * vdwioffsetptr2;
1558 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1559 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1560 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1561 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1562 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1563 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1564 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1565 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1566 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1567 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1568 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1569 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1570 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1571 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1572 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1573 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1574 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1577 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1580 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1581 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1583 __m128i vfitab_lo,vfitab_hi;
1584 __m128i ifour = _mm_set1_epi32(4);
1585 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1587 __m256 dummy_mask,cutoff_mask;
1588 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1589 __m256 one = _mm256_set1_ps(1.0);
1590 __m256 two = _mm256_set1_ps(2.0);
1596 jindex = nlist->jindex;
1598 shiftidx = nlist->shift;
1600 shiftvec = fr->shift_vec[0];
1601 fshift = fr->fshift[0];
1602 facel = _mm256_set1_ps(fr->ic->epsfac);
1603 charge = mdatoms->chargeA;
1604 nvdwtype = fr->ntype;
1605 vdwparam = fr->nbfp;
1606 vdwtype = mdatoms->typeA;
1608 vftab = kernel_data->table_elec_vdw->data;
1609 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
1611 /* Setup water-specific parameters */
1612 inr = nlist->iinr[0];
1613 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1614 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1615 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1616 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1618 jq0 = _mm256_set1_ps(charge[inr+0]);
1619 jq1 = _mm256_set1_ps(charge[inr+1]);
1620 jq2 = _mm256_set1_ps(charge[inr+2]);
1621 vdwjidx0A = 2*vdwtype[inr+0];
1622 qq00 = _mm256_mul_ps(iq0,jq0);
1623 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1624 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1625 qq01 = _mm256_mul_ps(iq0,jq1);
1626 qq02 = _mm256_mul_ps(iq0,jq2);
1627 qq10 = _mm256_mul_ps(iq1,jq0);
1628 qq11 = _mm256_mul_ps(iq1,jq1);
1629 qq12 = _mm256_mul_ps(iq1,jq2);
1630 qq20 = _mm256_mul_ps(iq2,jq0);
1631 qq21 = _mm256_mul_ps(iq2,jq1);
1632 qq22 = _mm256_mul_ps(iq2,jq2);
1634 /* Avoid stupid compiler warnings */
1635 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1636 j_coord_offsetA = 0;
1637 j_coord_offsetB = 0;
1638 j_coord_offsetC = 0;
1639 j_coord_offsetD = 0;
1640 j_coord_offsetE = 0;
1641 j_coord_offsetF = 0;
1642 j_coord_offsetG = 0;
1643 j_coord_offsetH = 0;
1648 for(iidx=0;iidx<4*DIM;iidx++)
1650 scratch[iidx] = 0.0;
1653 /* Start outer loop over neighborlists */
1654 for(iidx=0; iidx<nri; iidx++)
1656 /* Load shift vector for this list */
1657 i_shift_offset = DIM*shiftidx[iidx];
1659 /* Load limits for loop over neighbors */
1660 j_index_start = jindex[iidx];
1661 j_index_end = jindex[iidx+1];
1663 /* Get outer coordinate index */
1665 i_coord_offset = DIM*inr;
1667 /* Load i particle coords and add shift vector */
1668 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1669 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1671 fix0 = _mm256_setzero_ps();
1672 fiy0 = _mm256_setzero_ps();
1673 fiz0 = _mm256_setzero_ps();
1674 fix1 = _mm256_setzero_ps();
1675 fiy1 = _mm256_setzero_ps();
1676 fiz1 = _mm256_setzero_ps();
1677 fix2 = _mm256_setzero_ps();
1678 fiy2 = _mm256_setzero_ps();
1679 fiz2 = _mm256_setzero_ps();
1681 /* Start inner kernel loop */
1682 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1685 /* Get j neighbor index, and coordinate index */
1687 jnrB = jjnr[jidx+1];
1688 jnrC = jjnr[jidx+2];
1689 jnrD = jjnr[jidx+3];
1690 jnrE = jjnr[jidx+4];
1691 jnrF = jjnr[jidx+5];
1692 jnrG = jjnr[jidx+6];
1693 jnrH = jjnr[jidx+7];
1694 j_coord_offsetA = DIM*jnrA;
1695 j_coord_offsetB = DIM*jnrB;
1696 j_coord_offsetC = DIM*jnrC;
1697 j_coord_offsetD = DIM*jnrD;
1698 j_coord_offsetE = DIM*jnrE;
1699 j_coord_offsetF = DIM*jnrF;
1700 j_coord_offsetG = DIM*jnrG;
1701 j_coord_offsetH = DIM*jnrH;
1703 /* load j atom coordinates */
1704 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1705 x+j_coord_offsetC,x+j_coord_offsetD,
1706 x+j_coord_offsetE,x+j_coord_offsetF,
1707 x+j_coord_offsetG,x+j_coord_offsetH,
1708 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1710 /* Calculate displacement vector */
1711 dx00 = _mm256_sub_ps(ix0,jx0);
1712 dy00 = _mm256_sub_ps(iy0,jy0);
1713 dz00 = _mm256_sub_ps(iz0,jz0);
1714 dx01 = _mm256_sub_ps(ix0,jx1);
1715 dy01 = _mm256_sub_ps(iy0,jy1);
1716 dz01 = _mm256_sub_ps(iz0,jz1);
1717 dx02 = _mm256_sub_ps(ix0,jx2);
1718 dy02 = _mm256_sub_ps(iy0,jy2);
1719 dz02 = _mm256_sub_ps(iz0,jz2);
1720 dx10 = _mm256_sub_ps(ix1,jx0);
1721 dy10 = _mm256_sub_ps(iy1,jy0);
1722 dz10 = _mm256_sub_ps(iz1,jz0);
1723 dx11 = _mm256_sub_ps(ix1,jx1);
1724 dy11 = _mm256_sub_ps(iy1,jy1);
1725 dz11 = _mm256_sub_ps(iz1,jz1);
1726 dx12 = _mm256_sub_ps(ix1,jx2);
1727 dy12 = _mm256_sub_ps(iy1,jy2);
1728 dz12 = _mm256_sub_ps(iz1,jz2);
1729 dx20 = _mm256_sub_ps(ix2,jx0);
1730 dy20 = _mm256_sub_ps(iy2,jy0);
1731 dz20 = _mm256_sub_ps(iz2,jz0);
1732 dx21 = _mm256_sub_ps(ix2,jx1);
1733 dy21 = _mm256_sub_ps(iy2,jy1);
1734 dz21 = _mm256_sub_ps(iz2,jz1);
1735 dx22 = _mm256_sub_ps(ix2,jx2);
1736 dy22 = _mm256_sub_ps(iy2,jy2);
1737 dz22 = _mm256_sub_ps(iz2,jz2);
1739 /* Calculate squared distance and things based on it */
1740 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1741 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1742 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1743 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1744 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1745 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1746 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1747 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1748 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1750 rinv00 = avx256_invsqrt_f(rsq00);
1751 rinv01 = avx256_invsqrt_f(rsq01);
1752 rinv02 = avx256_invsqrt_f(rsq02);
1753 rinv10 = avx256_invsqrt_f(rsq10);
1754 rinv11 = avx256_invsqrt_f(rsq11);
1755 rinv12 = avx256_invsqrt_f(rsq12);
1756 rinv20 = avx256_invsqrt_f(rsq20);
1757 rinv21 = avx256_invsqrt_f(rsq21);
1758 rinv22 = avx256_invsqrt_f(rsq22);
1760 fjx0 = _mm256_setzero_ps();
1761 fjy0 = _mm256_setzero_ps();
1762 fjz0 = _mm256_setzero_ps();
1763 fjx1 = _mm256_setzero_ps();
1764 fjy1 = _mm256_setzero_ps();
1765 fjz1 = _mm256_setzero_ps();
1766 fjx2 = _mm256_setzero_ps();
1767 fjy2 = _mm256_setzero_ps();
1768 fjz2 = _mm256_setzero_ps();
1770 /**************************
1771 * CALCULATE INTERACTIONS *
1772 **************************/
1774 r00 = _mm256_mul_ps(rsq00,rinv00);
1776 /* Calculate table index by multiplying r with table scale and truncate to integer */
1777 rt = _mm256_mul_ps(r00,vftabscale);
1778 vfitab = _mm256_cvttps_epi32(rt);
1779 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1780 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1781 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1782 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1783 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1784 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1786 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1787 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1788 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1789 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1790 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1791 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1792 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1793 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1794 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1795 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1796 Heps = _mm256_mul_ps(vfeps,H);
1797 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1798 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1799 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1801 /* CUBIC SPLINE TABLE DISPERSION */
1802 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1803 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1804 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1805 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1806 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1807 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1808 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1809 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1810 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1811 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1812 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1813 Heps = _mm256_mul_ps(vfeps,H);
1814 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1815 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1816 fvdw6 = _mm256_mul_ps(c6_00,FF);
1818 /* CUBIC SPLINE TABLE REPULSION */
1819 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1820 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1821 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1822 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1823 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1824 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1825 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1826 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1827 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1828 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1829 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1830 Heps = _mm256_mul_ps(vfeps,H);
1831 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1832 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1833 fvdw12 = _mm256_mul_ps(c12_00,FF);
1834 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1836 fscal = _mm256_add_ps(felec,fvdw);
1838 /* Calculate temporary vectorial force */
1839 tx = _mm256_mul_ps(fscal,dx00);
1840 ty = _mm256_mul_ps(fscal,dy00);
1841 tz = _mm256_mul_ps(fscal,dz00);
1843 /* Update vectorial force */
1844 fix0 = _mm256_add_ps(fix0,tx);
1845 fiy0 = _mm256_add_ps(fiy0,ty);
1846 fiz0 = _mm256_add_ps(fiz0,tz);
1848 fjx0 = _mm256_add_ps(fjx0,tx);
1849 fjy0 = _mm256_add_ps(fjy0,ty);
1850 fjz0 = _mm256_add_ps(fjz0,tz);
1852 /**************************
1853 * CALCULATE INTERACTIONS *
1854 **************************/
1856 r01 = _mm256_mul_ps(rsq01,rinv01);
1858 /* Calculate table index by multiplying r with table scale and truncate to integer */
1859 rt = _mm256_mul_ps(r01,vftabscale);
1860 vfitab = _mm256_cvttps_epi32(rt);
1861 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1862 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1863 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1864 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1865 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1866 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1868 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1869 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1870 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1871 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1872 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1873 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1874 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1875 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1876 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1877 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1878 Heps = _mm256_mul_ps(vfeps,H);
1879 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1880 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1881 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1885 /* Calculate temporary vectorial force */
1886 tx = _mm256_mul_ps(fscal,dx01);
1887 ty = _mm256_mul_ps(fscal,dy01);
1888 tz = _mm256_mul_ps(fscal,dz01);
1890 /* Update vectorial force */
1891 fix0 = _mm256_add_ps(fix0,tx);
1892 fiy0 = _mm256_add_ps(fiy0,ty);
1893 fiz0 = _mm256_add_ps(fiz0,tz);
1895 fjx1 = _mm256_add_ps(fjx1,tx);
1896 fjy1 = _mm256_add_ps(fjy1,ty);
1897 fjz1 = _mm256_add_ps(fjz1,tz);
1899 /**************************
1900 * CALCULATE INTERACTIONS *
1901 **************************/
1903 r02 = _mm256_mul_ps(rsq02,rinv02);
1905 /* Calculate table index by multiplying r with table scale and truncate to integer */
1906 rt = _mm256_mul_ps(r02,vftabscale);
1907 vfitab = _mm256_cvttps_epi32(rt);
1908 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1909 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1910 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1911 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1912 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1913 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1915 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1916 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1917 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1918 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1919 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1920 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1921 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1922 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1923 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1924 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1925 Heps = _mm256_mul_ps(vfeps,H);
1926 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1927 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1928 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1932 /* Calculate temporary vectorial force */
1933 tx = _mm256_mul_ps(fscal,dx02);
1934 ty = _mm256_mul_ps(fscal,dy02);
1935 tz = _mm256_mul_ps(fscal,dz02);
1937 /* Update vectorial force */
1938 fix0 = _mm256_add_ps(fix0,tx);
1939 fiy0 = _mm256_add_ps(fiy0,ty);
1940 fiz0 = _mm256_add_ps(fiz0,tz);
1942 fjx2 = _mm256_add_ps(fjx2,tx);
1943 fjy2 = _mm256_add_ps(fjy2,ty);
1944 fjz2 = _mm256_add_ps(fjz2,tz);
1946 /**************************
1947 * CALCULATE INTERACTIONS *
1948 **************************/
1950 r10 = _mm256_mul_ps(rsq10,rinv10);
1952 /* Calculate table index by multiplying r with table scale and truncate to integer */
1953 rt = _mm256_mul_ps(r10,vftabscale);
1954 vfitab = _mm256_cvttps_epi32(rt);
1955 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1956 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1957 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1958 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1959 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1960 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1962 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1963 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1964 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1965 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1966 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1967 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1968 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1969 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1970 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1971 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1972 Heps = _mm256_mul_ps(vfeps,H);
1973 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1974 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1975 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1979 /* Calculate temporary vectorial force */
1980 tx = _mm256_mul_ps(fscal,dx10);
1981 ty = _mm256_mul_ps(fscal,dy10);
1982 tz = _mm256_mul_ps(fscal,dz10);
1984 /* Update vectorial force */
1985 fix1 = _mm256_add_ps(fix1,tx);
1986 fiy1 = _mm256_add_ps(fiy1,ty);
1987 fiz1 = _mm256_add_ps(fiz1,tz);
1989 fjx0 = _mm256_add_ps(fjx0,tx);
1990 fjy0 = _mm256_add_ps(fjy0,ty);
1991 fjz0 = _mm256_add_ps(fjz0,tz);
1993 /**************************
1994 * CALCULATE INTERACTIONS *
1995 **************************/
1997 r11 = _mm256_mul_ps(rsq11,rinv11);
1999 /* Calculate table index by multiplying r with table scale and truncate to integer */
2000 rt = _mm256_mul_ps(r11,vftabscale);
2001 vfitab = _mm256_cvttps_epi32(rt);
2002 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2003 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2004 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2005 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2006 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2007 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2009 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2010 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2011 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2012 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2013 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2014 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2015 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2016 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2017 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2018 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2019 Heps = _mm256_mul_ps(vfeps,H);
2020 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2021 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2022 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2026 /* Calculate temporary vectorial force */
2027 tx = _mm256_mul_ps(fscal,dx11);
2028 ty = _mm256_mul_ps(fscal,dy11);
2029 tz = _mm256_mul_ps(fscal,dz11);
2031 /* Update vectorial force */
2032 fix1 = _mm256_add_ps(fix1,tx);
2033 fiy1 = _mm256_add_ps(fiy1,ty);
2034 fiz1 = _mm256_add_ps(fiz1,tz);
2036 fjx1 = _mm256_add_ps(fjx1,tx);
2037 fjy1 = _mm256_add_ps(fjy1,ty);
2038 fjz1 = _mm256_add_ps(fjz1,tz);
2040 /**************************
2041 * CALCULATE INTERACTIONS *
2042 **************************/
2044 r12 = _mm256_mul_ps(rsq12,rinv12);
2046 /* Calculate table index by multiplying r with table scale and truncate to integer */
2047 rt = _mm256_mul_ps(r12,vftabscale);
2048 vfitab = _mm256_cvttps_epi32(rt);
2049 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2050 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2051 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2052 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2053 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2054 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2056 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2057 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2058 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2059 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2060 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2061 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2062 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2063 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2064 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2065 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2066 Heps = _mm256_mul_ps(vfeps,H);
2067 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2068 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2069 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2073 /* Calculate temporary vectorial force */
2074 tx = _mm256_mul_ps(fscal,dx12);
2075 ty = _mm256_mul_ps(fscal,dy12);
2076 tz = _mm256_mul_ps(fscal,dz12);
2078 /* Update vectorial force */
2079 fix1 = _mm256_add_ps(fix1,tx);
2080 fiy1 = _mm256_add_ps(fiy1,ty);
2081 fiz1 = _mm256_add_ps(fiz1,tz);
2083 fjx2 = _mm256_add_ps(fjx2,tx);
2084 fjy2 = _mm256_add_ps(fjy2,ty);
2085 fjz2 = _mm256_add_ps(fjz2,tz);
2087 /**************************
2088 * CALCULATE INTERACTIONS *
2089 **************************/
2091 r20 = _mm256_mul_ps(rsq20,rinv20);
2093 /* Calculate table index by multiplying r with table scale and truncate to integer */
2094 rt = _mm256_mul_ps(r20,vftabscale);
2095 vfitab = _mm256_cvttps_epi32(rt);
2096 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2097 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2098 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2099 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2100 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2101 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2103 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2104 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2105 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2106 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2107 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2108 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2109 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2110 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2111 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2112 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2113 Heps = _mm256_mul_ps(vfeps,H);
2114 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2115 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2116 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2120 /* Calculate temporary vectorial force */
2121 tx = _mm256_mul_ps(fscal,dx20);
2122 ty = _mm256_mul_ps(fscal,dy20);
2123 tz = _mm256_mul_ps(fscal,dz20);
2125 /* Update vectorial force */
2126 fix2 = _mm256_add_ps(fix2,tx);
2127 fiy2 = _mm256_add_ps(fiy2,ty);
2128 fiz2 = _mm256_add_ps(fiz2,tz);
2130 fjx0 = _mm256_add_ps(fjx0,tx);
2131 fjy0 = _mm256_add_ps(fjy0,ty);
2132 fjz0 = _mm256_add_ps(fjz0,tz);
2134 /**************************
2135 * CALCULATE INTERACTIONS *
2136 **************************/
2138 r21 = _mm256_mul_ps(rsq21,rinv21);
2140 /* Calculate table index by multiplying r with table scale and truncate to integer */
2141 rt = _mm256_mul_ps(r21,vftabscale);
2142 vfitab = _mm256_cvttps_epi32(rt);
2143 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2144 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2145 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2146 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2147 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2148 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2150 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2151 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2152 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2153 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2154 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2155 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2156 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2157 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2158 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2159 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2160 Heps = _mm256_mul_ps(vfeps,H);
2161 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2162 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2163 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2167 /* Calculate temporary vectorial force */
2168 tx = _mm256_mul_ps(fscal,dx21);
2169 ty = _mm256_mul_ps(fscal,dy21);
2170 tz = _mm256_mul_ps(fscal,dz21);
2172 /* Update vectorial force */
2173 fix2 = _mm256_add_ps(fix2,tx);
2174 fiy2 = _mm256_add_ps(fiy2,ty);
2175 fiz2 = _mm256_add_ps(fiz2,tz);
2177 fjx1 = _mm256_add_ps(fjx1,tx);
2178 fjy1 = _mm256_add_ps(fjy1,ty);
2179 fjz1 = _mm256_add_ps(fjz1,tz);
2181 /**************************
2182 * CALCULATE INTERACTIONS *
2183 **************************/
2185 r22 = _mm256_mul_ps(rsq22,rinv22);
2187 /* Calculate table index by multiplying r with table scale and truncate to integer */
2188 rt = _mm256_mul_ps(r22,vftabscale);
2189 vfitab = _mm256_cvttps_epi32(rt);
2190 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2191 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2192 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2193 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2194 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2195 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2197 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2198 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2199 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2200 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2201 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2202 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2203 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2204 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2205 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2206 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2207 Heps = _mm256_mul_ps(vfeps,H);
2208 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2209 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2210 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2214 /* Calculate temporary vectorial force */
2215 tx = _mm256_mul_ps(fscal,dx22);
2216 ty = _mm256_mul_ps(fscal,dy22);
2217 tz = _mm256_mul_ps(fscal,dz22);
2219 /* Update vectorial force */
2220 fix2 = _mm256_add_ps(fix2,tx);
2221 fiy2 = _mm256_add_ps(fiy2,ty);
2222 fiz2 = _mm256_add_ps(fiz2,tz);
2224 fjx2 = _mm256_add_ps(fjx2,tx);
2225 fjy2 = _mm256_add_ps(fjy2,ty);
2226 fjz2 = _mm256_add_ps(fjz2,tz);
2228 fjptrA = f+j_coord_offsetA;
2229 fjptrB = f+j_coord_offsetB;
2230 fjptrC = f+j_coord_offsetC;
2231 fjptrD = f+j_coord_offsetD;
2232 fjptrE = f+j_coord_offsetE;
2233 fjptrF = f+j_coord_offsetF;
2234 fjptrG = f+j_coord_offsetG;
2235 fjptrH = f+j_coord_offsetH;
2237 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2238 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2240 /* Inner loop uses 373 flops */
2243 if(jidx<j_index_end)
2246 /* Get j neighbor index, and coordinate index */
2247 jnrlistA = jjnr[jidx];
2248 jnrlistB = jjnr[jidx+1];
2249 jnrlistC = jjnr[jidx+2];
2250 jnrlistD = jjnr[jidx+3];
2251 jnrlistE = jjnr[jidx+4];
2252 jnrlistF = jjnr[jidx+5];
2253 jnrlistG = jjnr[jidx+6];
2254 jnrlistH = jjnr[jidx+7];
2255 /* Sign of each element will be negative for non-real atoms.
2256 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2257 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2259 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2260 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2262 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2263 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2264 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2265 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2266 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2267 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2268 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2269 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2270 j_coord_offsetA = DIM*jnrA;
2271 j_coord_offsetB = DIM*jnrB;
2272 j_coord_offsetC = DIM*jnrC;
2273 j_coord_offsetD = DIM*jnrD;
2274 j_coord_offsetE = DIM*jnrE;
2275 j_coord_offsetF = DIM*jnrF;
2276 j_coord_offsetG = DIM*jnrG;
2277 j_coord_offsetH = DIM*jnrH;
2279 /* load j atom coordinates */
2280 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2281 x+j_coord_offsetC,x+j_coord_offsetD,
2282 x+j_coord_offsetE,x+j_coord_offsetF,
2283 x+j_coord_offsetG,x+j_coord_offsetH,
2284 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2286 /* Calculate displacement vector */
2287 dx00 = _mm256_sub_ps(ix0,jx0);
2288 dy00 = _mm256_sub_ps(iy0,jy0);
2289 dz00 = _mm256_sub_ps(iz0,jz0);
2290 dx01 = _mm256_sub_ps(ix0,jx1);
2291 dy01 = _mm256_sub_ps(iy0,jy1);
2292 dz01 = _mm256_sub_ps(iz0,jz1);
2293 dx02 = _mm256_sub_ps(ix0,jx2);
2294 dy02 = _mm256_sub_ps(iy0,jy2);
2295 dz02 = _mm256_sub_ps(iz0,jz2);
2296 dx10 = _mm256_sub_ps(ix1,jx0);
2297 dy10 = _mm256_sub_ps(iy1,jy0);
2298 dz10 = _mm256_sub_ps(iz1,jz0);
2299 dx11 = _mm256_sub_ps(ix1,jx1);
2300 dy11 = _mm256_sub_ps(iy1,jy1);
2301 dz11 = _mm256_sub_ps(iz1,jz1);
2302 dx12 = _mm256_sub_ps(ix1,jx2);
2303 dy12 = _mm256_sub_ps(iy1,jy2);
2304 dz12 = _mm256_sub_ps(iz1,jz2);
2305 dx20 = _mm256_sub_ps(ix2,jx0);
2306 dy20 = _mm256_sub_ps(iy2,jy0);
2307 dz20 = _mm256_sub_ps(iz2,jz0);
2308 dx21 = _mm256_sub_ps(ix2,jx1);
2309 dy21 = _mm256_sub_ps(iy2,jy1);
2310 dz21 = _mm256_sub_ps(iz2,jz1);
2311 dx22 = _mm256_sub_ps(ix2,jx2);
2312 dy22 = _mm256_sub_ps(iy2,jy2);
2313 dz22 = _mm256_sub_ps(iz2,jz2);
2315 /* Calculate squared distance and things based on it */
2316 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2317 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2318 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2319 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2320 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2321 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2322 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2323 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2324 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2326 rinv00 = avx256_invsqrt_f(rsq00);
2327 rinv01 = avx256_invsqrt_f(rsq01);
2328 rinv02 = avx256_invsqrt_f(rsq02);
2329 rinv10 = avx256_invsqrt_f(rsq10);
2330 rinv11 = avx256_invsqrt_f(rsq11);
2331 rinv12 = avx256_invsqrt_f(rsq12);
2332 rinv20 = avx256_invsqrt_f(rsq20);
2333 rinv21 = avx256_invsqrt_f(rsq21);
2334 rinv22 = avx256_invsqrt_f(rsq22);
2336 fjx0 = _mm256_setzero_ps();
2337 fjy0 = _mm256_setzero_ps();
2338 fjz0 = _mm256_setzero_ps();
2339 fjx1 = _mm256_setzero_ps();
2340 fjy1 = _mm256_setzero_ps();
2341 fjz1 = _mm256_setzero_ps();
2342 fjx2 = _mm256_setzero_ps();
2343 fjy2 = _mm256_setzero_ps();
2344 fjz2 = _mm256_setzero_ps();
2346 /**************************
2347 * CALCULATE INTERACTIONS *
2348 **************************/
2350 r00 = _mm256_mul_ps(rsq00,rinv00);
2351 r00 = _mm256_andnot_ps(dummy_mask,r00);
2353 /* Calculate table index by multiplying r with table scale and truncate to integer */
2354 rt = _mm256_mul_ps(r00,vftabscale);
2355 vfitab = _mm256_cvttps_epi32(rt);
2356 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2357 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2358 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2359 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2360 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2361 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2363 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2364 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2365 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2366 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2367 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2368 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2369 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2370 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2371 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2372 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2373 Heps = _mm256_mul_ps(vfeps,H);
2374 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2375 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2376 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2378 /* CUBIC SPLINE TABLE DISPERSION */
2379 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2380 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2381 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2382 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2383 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2384 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2385 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2386 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2387 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2388 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2389 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2390 Heps = _mm256_mul_ps(vfeps,H);
2391 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2392 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2393 fvdw6 = _mm256_mul_ps(c6_00,FF);
2395 /* CUBIC SPLINE TABLE REPULSION */
2396 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2397 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2398 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2399 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2400 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2401 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2402 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2403 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2404 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2405 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2406 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2407 Heps = _mm256_mul_ps(vfeps,H);
2408 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2409 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2410 fvdw12 = _mm256_mul_ps(c12_00,FF);
2411 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2413 fscal = _mm256_add_ps(felec,fvdw);
2415 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2417 /* Calculate temporary vectorial force */
2418 tx = _mm256_mul_ps(fscal,dx00);
2419 ty = _mm256_mul_ps(fscal,dy00);
2420 tz = _mm256_mul_ps(fscal,dz00);
2422 /* Update vectorial force */
2423 fix0 = _mm256_add_ps(fix0,tx);
2424 fiy0 = _mm256_add_ps(fiy0,ty);
2425 fiz0 = _mm256_add_ps(fiz0,tz);
2427 fjx0 = _mm256_add_ps(fjx0,tx);
2428 fjy0 = _mm256_add_ps(fjy0,ty);
2429 fjz0 = _mm256_add_ps(fjz0,tz);
2431 /**************************
2432 * CALCULATE INTERACTIONS *
2433 **************************/
2435 r01 = _mm256_mul_ps(rsq01,rinv01);
2436 r01 = _mm256_andnot_ps(dummy_mask,r01);
2438 /* Calculate table index by multiplying r with table scale and truncate to integer */
2439 rt = _mm256_mul_ps(r01,vftabscale);
2440 vfitab = _mm256_cvttps_epi32(rt);
2441 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2442 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2443 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2444 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2445 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2446 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2448 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2449 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2450 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2451 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2452 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2453 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2454 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2455 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2456 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2457 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2458 Heps = _mm256_mul_ps(vfeps,H);
2459 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2460 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2461 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2465 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2467 /* Calculate temporary vectorial force */
2468 tx = _mm256_mul_ps(fscal,dx01);
2469 ty = _mm256_mul_ps(fscal,dy01);
2470 tz = _mm256_mul_ps(fscal,dz01);
2472 /* Update vectorial force */
2473 fix0 = _mm256_add_ps(fix0,tx);
2474 fiy0 = _mm256_add_ps(fiy0,ty);
2475 fiz0 = _mm256_add_ps(fiz0,tz);
2477 fjx1 = _mm256_add_ps(fjx1,tx);
2478 fjy1 = _mm256_add_ps(fjy1,ty);
2479 fjz1 = _mm256_add_ps(fjz1,tz);
2481 /**************************
2482 * CALCULATE INTERACTIONS *
2483 **************************/
2485 r02 = _mm256_mul_ps(rsq02,rinv02);
2486 r02 = _mm256_andnot_ps(dummy_mask,r02);
2488 /* Calculate table index by multiplying r with table scale and truncate to integer */
2489 rt = _mm256_mul_ps(r02,vftabscale);
2490 vfitab = _mm256_cvttps_epi32(rt);
2491 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2492 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2493 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2494 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2495 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2496 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2498 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2499 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2500 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2501 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2502 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2503 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2504 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2505 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2506 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2507 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2508 Heps = _mm256_mul_ps(vfeps,H);
2509 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2510 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2511 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2515 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2517 /* Calculate temporary vectorial force */
2518 tx = _mm256_mul_ps(fscal,dx02);
2519 ty = _mm256_mul_ps(fscal,dy02);
2520 tz = _mm256_mul_ps(fscal,dz02);
2522 /* Update vectorial force */
2523 fix0 = _mm256_add_ps(fix0,tx);
2524 fiy0 = _mm256_add_ps(fiy0,ty);
2525 fiz0 = _mm256_add_ps(fiz0,tz);
2527 fjx2 = _mm256_add_ps(fjx2,tx);
2528 fjy2 = _mm256_add_ps(fjy2,ty);
2529 fjz2 = _mm256_add_ps(fjz2,tz);
2531 /**************************
2532 * CALCULATE INTERACTIONS *
2533 **************************/
2535 r10 = _mm256_mul_ps(rsq10,rinv10);
2536 r10 = _mm256_andnot_ps(dummy_mask,r10);
2538 /* Calculate table index by multiplying r with table scale and truncate to integer */
2539 rt = _mm256_mul_ps(r10,vftabscale);
2540 vfitab = _mm256_cvttps_epi32(rt);
2541 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2542 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2543 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2544 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2545 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2546 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2548 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2549 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2550 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2551 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2552 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2553 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2554 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2555 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2556 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2557 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2558 Heps = _mm256_mul_ps(vfeps,H);
2559 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2560 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2561 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2565 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2567 /* Calculate temporary vectorial force */
2568 tx = _mm256_mul_ps(fscal,dx10);
2569 ty = _mm256_mul_ps(fscal,dy10);
2570 tz = _mm256_mul_ps(fscal,dz10);
2572 /* Update vectorial force */
2573 fix1 = _mm256_add_ps(fix1,tx);
2574 fiy1 = _mm256_add_ps(fiy1,ty);
2575 fiz1 = _mm256_add_ps(fiz1,tz);
2577 fjx0 = _mm256_add_ps(fjx0,tx);
2578 fjy0 = _mm256_add_ps(fjy0,ty);
2579 fjz0 = _mm256_add_ps(fjz0,tz);
2581 /**************************
2582 * CALCULATE INTERACTIONS *
2583 **************************/
2585 r11 = _mm256_mul_ps(rsq11,rinv11);
2586 r11 = _mm256_andnot_ps(dummy_mask,r11);
2588 /* Calculate table index by multiplying r with table scale and truncate to integer */
2589 rt = _mm256_mul_ps(r11,vftabscale);
2590 vfitab = _mm256_cvttps_epi32(rt);
2591 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2592 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2593 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2594 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2595 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2596 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2598 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2599 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2600 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2601 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2602 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2603 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2604 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2605 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2606 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2607 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2608 Heps = _mm256_mul_ps(vfeps,H);
2609 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2610 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2611 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2615 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2617 /* Calculate temporary vectorial force */
2618 tx = _mm256_mul_ps(fscal,dx11);
2619 ty = _mm256_mul_ps(fscal,dy11);
2620 tz = _mm256_mul_ps(fscal,dz11);
2622 /* Update vectorial force */
2623 fix1 = _mm256_add_ps(fix1,tx);
2624 fiy1 = _mm256_add_ps(fiy1,ty);
2625 fiz1 = _mm256_add_ps(fiz1,tz);
2627 fjx1 = _mm256_add_ps(fjx1,tx);
2628 fjy1 = _mm256_add_ps(fjy1,ty);
2629 fjz1 = _mm256_add_ps(fjz1,tz);
2631 /**************************
2632 * CALCULATE INTERACTIONS *
2633 **************************/
2635 r12 = _mm256_mul_ps(rsq12,rinv12);
2636 r12 = _mm256_andnot_ps(dummy_mask,r12);
2638 /* Calculate table index by multiplying r with table scale and truncate to integer */
2639 rt = _mm256_mul_ps(r12,vftabscale);
2640 vfitab = _mm256_cvttps_epi32(rt);
2641 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2642 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2643 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2644 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2645 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2646 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2648 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2649 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2650 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2651 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2652 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2653 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2654 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2655 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2656 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2657 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2658 Heps = _mm256_mul_ps(vfeps,H);
2659 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2660 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2661 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2665 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2667 /* Calculate temporary vectorial force */
2668 tx = _mm256_mul_ps(fscal,dx12);
2669 ty = _mm256_mul_ps(fscal,dy12);
2670 tz = _mm256_mul_ps(fscal,dz12);
2672 /* Update vectorial force */
2673 fix1 = _mm256_add_ps(fix1,tx);
2674 fiy1 = _mm256_add_ps(fiy1,ty);
2675 fiz1 = _mm256_add_ps(fiz1,tz);
2677 fjx2 = _mm256_add_ps(fjx2,tx);
2678 fjy2 = _mm256_add_ps(fjy2,ty);
2679 fjz2 = _mm256_add_ps(fjz2,tz);
2681 /**************************
2682 * CALCULATE INTERACTIONS *
2683 **************************/
2685 r20 = _mm256_mul_ps(rsq20,rinv20);
2686 r20 = _mm256_andnot_ps(dummy_mask,r20);
2688 /* Calculate table index by multiplying r with table scale and truncate to integer */
2689 rt = _mm256_mul_ps(r20,vftabscale);
2690 vfitab = _mm256_cvttps_epi32(rt);
2691 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2692 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2693 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2694 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2695 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2696 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2698 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2699 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2700 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2701 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2702 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2703 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2704 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2705 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2706 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2707 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2708 Heps = _mm256_mul_ps(vfeps,H);
2709 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2710 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2711 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2715 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2717 /* Calculate temporary vectorial force */
2718 tx = _mm256_mul_ps(fscal,dx20);
2719 ty = _mm256_mul_ps(fscal,dy20);
2720 tz = _mm256_mul_ps(fscal,dz20);
2722 /* Update vectorial force */
2723 fix2 = _mm256_add_ps(fix2,tx);
2724 fiy2 = _mm256_add_ps(fiy2,ty);
2725 fiz2 = _mm256_add_ps(fiz2,tz);
2727 fjx0 = _mm256_add_ps(fjx0,tx);
2728 fjy0 = _mm256_add_ps(fjy0,ty);
2729 fjz0 = _mm256_add_ps(fjz0,tz);
2731 /**************************
2732 * CALCULATE INTERACTIONS *
2733 **************************/
2735 r21 = _mm256_mul_ps(rsq21,rinv21);
2736 r21 = _mm256_andnot_ps(dummy_mask,r21);
2738 /* Calculate table index by multiplying r with table scale and truncate to integer */
2739 rt = _mm256_mul_ps(r21,vftabscale);
2740 vfitab = _mm256_cvttps_epi32(rt);
2741 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2742 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2743 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2744 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2745 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2746 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2748 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2749 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2750 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2751 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2752 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2753 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2754 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2755 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2756 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2757 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2758 Heps = _mm256_mul_ps(vfeps,H);
2759 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2760 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2761 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2765 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2767 /* Calculate temporary vectorial force */
2768 tx = _mm256_mul_ps(fscal,dx21);
2769 ty = _mm256_mul_ps(fscal,dy21);
2770 tz = _mm256_mul_ps(fscal,dz21);
2772 /* Update vectorial force */
2773 fix2 = _mm256_add_ps(fix2,tx);
2774 fiy2 = _mm256_add_ps(fiy2,ty);
2775 fiz2 = _mm256_add_ps(fiz2,tz);
2777 fjx1 = _mm256_add_ps(fjx1,tx);
2778 fjy1 = _mm256_add_ps(fjy1,ty);
2779 fjz1 = _mm256_add_ps(fjz1,tz);
2781 /**************************
2782 * CALCULATE INTERACTIONS *
2783 **************************/
2785 r22 = _mm256_mul_ps(rsq22,rinv22);
2786 r22 = _mm256_andnot_ps(dummy_mask,r22);
2788 /* Calculate table index by multiplying r with table scale and truncate to integer */
2789 rt = _mm256_mul_ps(r22,vftabscale);
2790 vfitab = _mm256_cvttps_epi32(rt);
2791 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2792 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2793 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2794 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2795 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2796 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2798 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2799 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2800 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2801 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2802 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2803 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2804 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2805 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2806 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2807 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2808 Heps = _mm256_mul_ps(vfeps,H);
2809 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2810 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2811 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2815 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2817 /* Calculate temporary vectorial force */
2818 tx = _mm256_mul_ps(fscal,dx22);
2819 ty = _mm256_mul_ps(fscal,dy22);
2820 tz = _mm256_mul_ps(fscal,dz22);
2822 /* Update vectorial force */
2823 fix2 = _mm256_add_ps(fix2,tx);
2824 fiy2 = _mm256_add_ps(fiy2,ty);
2825 fiz2 = _mm256_add_ps(fiz2,tz);
2827 fjx2 = _mm256_add_ps(fjx2,tx);
2828 fjy2 = _mm256_add_ps(fjy2,ty);
2829 fjz2 = _mm256_add_ps(fjz2,tz);
2831 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2832 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2833 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2834 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2835 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2836 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2837 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2838 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2840 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2841 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2843 /* Inner loop uses 382 flops */
2846 /* End of innermost loop */
2848 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2849 f+i_coord_offset,fshift+i_shift_offset);
2851 /* Increment number of inner iterations */
2852 inneriter += j_index_end - j_index_start;
2854 /* Outer loop uses 18 flops */
2857 /* Increment number of outer iterations */
2860 /* Update outer/inner flops */
2862 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);