2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: CubicSplineTable
56 * Geometry: Water3-Water3
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the eight positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr0;
89 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90 real * vdwioffsetptr1;
91 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92 real * vdwioffsetptr2;
93 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
95 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
96 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
97 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
98 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
99 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
100 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
101 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
102 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
103 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
104 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
105 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
106 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
107 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
108 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
109 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
116 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
118 __m128i vfitab_lo,vfitab_hi;
119 __m128i ifour = _mm_set1_epi32(4);
120 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
122 __m256 dummy_mask,cutoff_mask;
123 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
124 __m256 one = _mm256_set1_ps(1.0);
125 __m256 two = _mm256_set1_ps(2.0);
131 jindex = nlist->jindex;
133 shiftidx = nlist->shift;
135 shiftvec = fr->shift_vec[0];
136 fshift = fr->fshift[0];
137 facel = _mm256_set1_ps(fr->epsfac);
138 charge = mdatoms->chargeA;
139 nvdwtype = fr->ntype;
141 vdwtype = mdatoms->typeA;
143 vftab = kernel_data->table_elec_vdw->data;
144 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
146 /* Setup water-specific parameters */
147 inr = nlist->iinr[0];
148 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
149 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
150 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
151 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
153 jq0 = _mm256_set1_ps(charge[inr+0]);
154 jq1 = _mm256_set1_ps(charge[inr+1]);
155 jq2 = _mm256_set1_ps(charge[inr+2]);
156 vdwjidx0A = 2*vdwtype[inr+0];
157 qq00 = _mm256_mul_ps(iq0,jq0);
158 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
159 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
160 qq01 = _mm256_mul_ps(iq0,jq1);
161 qq02 = _mm256_mul_ps(iq0,jq2);
162 qq10 = _mm256_mul_ps(iq1,jq0);
163 qq11 = _mm256_mul_ps(iq1,jq1);
164 qq12 = _mm256_mul_ps(iq1,jq2);
165 qq20 = _mm256_mul_ps(iq2,jq0);
166 qq21 = _mm256_mul_ps(iq2,jq1);
167 qq22 = _mm256_mul_ps(iq2,jq2);
169 /* Avoid stupid compiler warnings */
170 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
183 for(iidx=0;iidx<4*DIM;iidx++)
188 /* Start outer loop over neighborlists */
189 for(iidx=0; iidx<nri; iidx++)
191 /* Load shift vector for this list */
192 i_shift_offset = DIM*shiftidx[iidx];
194 /* Load limits for loop over neighbors */
195 j_index_start = jindex[iidx];
196 j_index_end = jindex[iidx+1];
198 /* Get outer coordinate index */
200 i_coord_offset = DIM*inr;
202 /* Load i particle coords and add shift vector */
203 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
204 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
206 fix0 = _mm256_setzero_ps();
207 fiy0 = _mm256_setzero_ps();
208 fiz0 = _mm256_setzero_ps();
209 fix1 = _mm256_setzero_ps();
210 fiy1 = _mm256_setzero_ps();
211 fiz1 = _mm256_setzero_ps();
212 fix2 = _mm256_setzero_ps();
213 fiy2 = _mm256_setzero_ps();
214 fiz2 = _mm256_setzero_ps();
216 /* Reset potential sums */
217 velecsum = _mm256_setzero_ps();
218 vvdwsum = _mm256_setzero_ps();
220 /* Start inner kernel loop */
221 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
224 /* Get j neighbor index, and coordinate index */
233 j_coord_offsetA = DIM*jnrA;
234 j_coord_offsetB = DIM*jnrB;
235 j_coord_offsetC = DIM*jnrC;
236 j_coord_offsetD = DIM*jnrD;
237 j_coord_offsetE = DIM*jnrE;
238 j_coord_offsetF = DIM*jnrF;
239 j_coord_offsetG = DIM*jnrG;
240 j_coord_offsetH = DIM*jnrH;
242 /* load j atom coordinates */
243 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
244 x+j_coord_offsetC,x+j_coord_offsetD,
245 x+j_coord_offsetE,x+j_coord_offsetF,
246 x+j_coord_offsetG,x+j_coord_offsetH,
247 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
249 /* Calculate displacement vector */
250 dx00 = _mm256_sub_ps(ix0,jx0);
251 dy00 = _mm256_sub_ps(iy0,jy0);
252 dz00 = _mm256_sub_ps(iz0,jz0);
253 dx01 = _mm256_sub_ps(ix0,jx1);
254 dy01 = _mm256_sub_ps(iy0,jy1);
255 dz01 = _mm256_sub_ps(iz0,jz1);
256 dx02 = _mm256_sub_ps(ix0,jx2);
257 dy02 = _mm256_sub_ps(iy0,jy2);
258 dz02 = _mm256_sub_ps(iz0,jz2);
259 dx10 = _mm256_sub_ps(ix1,jx0);
260 dy10 = _mm256_sub_ps(iy1,jy0);
261 dz10 = _mm256_sub_ps(iz1,jz0);
262 dx11 = _mm256_sub_ps(ix1,jx1);
263 dy11 = _mm256_sub_ps(iy1,jy1);
264 dz11 = _mm256_sub_ps(iz1,jz1);
265 dx12 = _mm256_sub_ps(ix1,jx2);
266 dy12 = _mm256_sub_ps(iy1,jy2);
267 dz12 = _mm256_sub_ps(iz1,jz2);
268 dx20 = _mm256_sub_ps(ix2,jx0);
269 dy20 = _mm256_sub_ps(iy2,jy0);
270 dz20 = _mm256_sub_ps(iz2,jz0);
271 dx21 = _mm256_sub_ps(ix2,jx1);
272 dy21 = _mm256_sub_ps(iy2,jy1);
273 dz21 = _mm256_sub_ps(iz2,jz1);
274 dx22 = _mm256_sub_ps(ix2,jx2);
275 dy22 = _mm256_sub_ps(iy2,jy2);
276 dz22 = _mm256_sub_ps(iz2,jz2);
278 /* Calculate squared distance and things based on it */
279 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
280 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
281 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
282 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
283 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
284 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
285 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
286 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
287 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
289 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
290 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
291 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
292 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
293 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
294 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
295 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
296 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
297 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
299 fjx0 = _mm256_setzero_ps();
300 fjy0 = _mm256_setzero_ps();
301 fjz0 = _mm256_setzero_ps();
302 fjx1 = _mm256_setzero_ps();
303 fjy1 = _mm256_setzero_ps();
304 fjz1 = _mm256_setzero_ps();
305 fjx2 = _mm256_setzero_ps();
306 fjy2 = _mm256_setzero_ps();
307 fjz2 = _mm256_setzero_ps();
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 r00 = _mm256_mul_ps(rsq00,rinv00);
315 /* Calculate table index by multiplying r with table scale and truncate to integer */
316 rt = _mm256_mul_ps(r00,vftabscale);
317 vfitab = _mm256_cvttps_epi32(rt);
318 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
319 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
320 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
321 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
322 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
323 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
325 /* CUBIC SPLINE TABLE ELECTROSTATICS */
326 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
327 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
328 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
329 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
330 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
331 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
332 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
333 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
334 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
335 Heps = _mm256_mul_ps(vfeps,H);
336 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
337 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
338 velec = _mm256_mul_ps(qq00,VV);
339 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
340 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
342 /* CUBIC SPLINE TABLE DISPERSION */
343 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
344 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
345 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
346 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
347 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
348 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
349 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
350 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
351 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
352 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
353 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
354 Heps = _mm256_mul_ps(vfeps,H);
355 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
356 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
357 vvdw6 = _mm256_mul_ps(c6_00,VV);
358 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
359 fvdw6 = _mm256_mul_ps(c6_00,FF);
361 /* CUBIC SPLINE TABLE REPULSION */
362 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
363 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
364 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
365 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
366 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
367 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
368 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
369 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
370 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
371 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
372 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
373 Heps = _mm256_mul_ps(vfeps,H);
374 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
375 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
376 vvdw12 = _mm256_mul_ps(c12_00,VV);
377 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
378 fvdw12 = _mm256_mul_ps(c12_00,FF);
379 vvdw = _mm256_add_ps(vvdw12,vvdw6);
380 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
382 /* Update potential sum for this i atom from the interaction with this j atom. */
383 velecsum = _mm256_add_ps(velecsum,velec);
384 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
386 fscal = _mm256_add_ps(felec,fvdw);
388 /* Calculate temporary vectorial force */
389 tx = _mm256_mul_ps(fscal,dx00);
390 ty = _mm256_mul_ps(fscal,dy00);
391 tz = _mm256_mul_ps(fscal,dz00);
393 /* Update vectorial force */
394 fix0 = _mm256_add_ps(fix0,tx);
395 fiy0 = _mm256_add_ps(fiy0,ty);
396 fiz0 = _mm256_add_ps(fiz0,tz);
398 fjx0 = _mm256_add_ps(fjx0,tx);
399 fjy0 = _mm256_add_ps(fjy0,ty);
400 fjz0 = _mm256_add_ps(fjz0,tz);
402 /**************************
403 * CALCULATE INTERACTIONS *
404 **************************/
406 r01 = _mm256_mul_ps(rsq01,rinv01);
408 /* Calculate table index by multiplying r with table scale and truncate to integer */
409 rt = _mm256_mul_ps(r01,vftabscale);
410 vfitab = _mm256_cvttps_epi32(rt);
411 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
412 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
413 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
414 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
415 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
416 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
418 /* CUBIC SPLINE TABLE ELECTROSTATICS */
419 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
420 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
421 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
422 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
423 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
424 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
425 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
426 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
427 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
428 Heps = _mm256_mul_ps(vfeps,H);
429 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
430 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
431 velec = _mm256_mul_ps(qq01,VV);
432 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
433 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
435 /* Update potential sum for this i atom from the interaction with this j atom. */
436 velecsum = _mm256_add_ps(velecsum,velec);
440 /* Calculate temporary vectorial force */
441 tx = _mm256_mul_ps(fscal,dx01);
442 ty = _mm256_mul_ps(fscal,dy01);
443 tz = _mm256_mul_ps(fscal,dz01);
445 /* Update vectorial force */
446 fix0 = _mm256_add_ps(fix0,tx);
447 fiy0 = _mm256_add_ps(fiy0,ty);
448 fiz0 = _mm256_add_ps(fiz0,tz);
450 fjx1 = _mm256_add_ps(fjx1,tx);
451 fjy1 = _mm256_add_ps(fjy1,ty);
452 fjz1 = _mm256_add_ps(fjz1,tz);
454 /**************************
455 * CALCULATE INTERACTIONS *
456 **************************/
458 r02 = _mm256_mul_ps(rsq02,rinv02);
460 /* Calculate table index by multiplying r with table scale and truncate to integer */
461 rt = _mm256_mul_ps(r02,vftabscale);
462 vfitab = _mm256_cvttps_epi32(rt);
463 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
464 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
465 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
466 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
467 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
468 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
470 /* CUBIC SPLINE TABLE ELECTROSTATICS */
471 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
472 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
473 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
474 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
475 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
476 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
477 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
478 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
479 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
480 Heps = _mm256_mul_ps(vfeps,H);
481 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
482 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
483 velec = _mm256_mul_ps(qq02,VV);
484 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
485 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velecsum = _mm256_add_ps(velecsum,velec);
492 /* Calculate temporary vectorial force */
493 tx = _mm256_mul_ps(fscal,dx02);
494 ty = _mm256_mul_ps(fscal,dy02);
495 tz = _mm256_mul_ps(fscal,dz02);
497 /* Update vectorial force */
498 fix0 = _mm256_add_ps(fix0,tx);
499 fiy0 = _mm256_add_ps(fiy0,ty);
500 fiz0 = _mm256_add_ps(fiz0,tz);
502 fjx2 = _mm256_add_ps(fjx2,tx);
503 fjy2 = _mm256_add_ps(fjy2,ty);
504 fjz2 = _mm256_add_ps(fjz2,tz);
506 /**************************
507 * CALCULATE INTERACTIONS *
508 **************************/
510 r10 = _mm256_mul_ps(rsq10,rinv10);
512 /* Calculate table index by multiplying r with table scale and truncate to integer */
513 rt = _mm256_mul_ps(r10,vftabscale);
514 vfitab = _mm256_cvttps_epi32(rt);
515 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
516 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
517 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
518 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
519 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
520 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
522 /* CUBIC SPLINE TABLE ELECTROSTATICS */
523 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
524 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
525 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
526 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
527 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
528 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
529 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
530 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
531 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
532 Heps = _mm256_mul_ps(vfeps,H);
533 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
534 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
535 velec = _mm256_mul_ps(qq10,VV);
536 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
537 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
539 /* Update potential sum for this i atom from the interaction with this j atom. */
540 velecsum = _mm256_add_ps(velecsum,velec);
544 /* Calculate temporary vectorial force */
545 tx = _mm256_mul_ps(fscal,dx10);
546 ty = _mm256_mul_ps(fscal,dy10);
547 tz = _mm256_mul_ps(fscal,dz10);
549 /* Update vectorial force */
550 fix1 = _mm256_add_ps(fix1,tx);
551 fiy1 = _mm256_add_ps(fiy1,ty);
552 fiz1 = _mm256_add_ps(fiz1,tz);
554 fjx0 = _mm256_add_ps(fjx0,tx);
555 fjy0 = _mm256_add_ps(fjy0,ty);
556 fjz0 = _mm256_add_ps(fjz0,tz);
558 /**************************
559 * CALCULATE INTERACTIONS *
560 **************************/
562 r11 = _mm256_mul_ps(rsq11,rinv11);
564 /* Calculate table index by multiplying r with table scale and truncate to integer */
565 rt = _mm256_mul_ps(r11,vftabscale);
566 vfitab = _mm256_cvttps_epi32(rt);
567 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
568 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
569 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
570 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
571 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
572 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
574 /* CUBIC SPLINE TABLE ELECTROSTATICS */
575 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
576 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
577 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
578 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
579 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
580 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
581 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
582 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
583 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
584 Heps = _mm256_mul_ps(vfeps,H);
585 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
586 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
587 velec = _mm256_mul_ps(qq11,VV);
588 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
589 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
591 /* Update potential sum for this i atom from the interaction with this j atom. */
592 velecsum = _mm256_add_ps(velecsum,velec);
596 /* Calculate temporary vectorial force */
597 tx = _mm256_mul_ps(fscal,dx11);
598 ty = _mm256_mul_ps(fscal,dy11);
599 tz = _mm256_mul_ps(fscal,dz11);
601 /* Update vectorial force */
602 fix1 = _mm256_add_ps(fix1,tx);
603 fiy1 = _mm256_add_ps(fiy1,ty);
604 fiz1 = _mm256_add_ps(fiz1,tz);
606 fjx1 = _mm256_add_ps(fjx1,tx);
607 fjy1 = _mm256_add_ps(fjy1,ty);
608 fjz1 = _mm256_add_ps(fjz1,tz);
610 /**************************
611 * CALCULATE INTERACTIONS *
612 **************************/
614 r12 = _mm256_mul_ps(rsq12,rinv12);
616 /* Calculate table index by multiplying r with table scale and truncate to integer */
617 rt = _mm256_mul_ps(r12,vftabscale);
618 vfitab = _mm256_cvttps_epi32(rt);
619 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
620 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
621 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
622 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
623 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
624 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
626 /* CUBIC SPLINE TABLE ELECTROSTATICS */
627 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
628 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
629 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
630 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
631 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
632 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
633 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
634 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
635 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
636 Heps = _mm256_mul_ps(vfeps,H);
637 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
638 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
639 velec = _mm256_mul_ps(qq12,VV);
640 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
641 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
643 /* Update potential sum for this i atom from the interaction with this j atom. */
644 velecsum = _mm256_add_ps(velecsum,velec);
648 /* Calculate temporary vectorial force */
649 tx = _mm256_mul_ps(fscal,dx12);
650 ty = _mm256_mul_ps(fscal,dy12);
651 tz = _mm256_mul_ps(fscal,dz12);
653 /* Update vectorial force */
654 fix1 = _mm256_add_ps(fix1,tx);
655 fiy1 = _mm256_add_ps(fiy1,ty);
656 fiz1 = _mm256_add_ps(fiz1,tz);
658 fjx2 = _mm256_add_ps(fjx2,tx);
659 fjy2 = _mm256_add_ps(fjy2,ty);
660 fjz2 = _mm256_add_ps(fjz2,tz);
662 /**************************
663 * CALCULATE INTERACTIONS *
664 **************************/
666 r20 = _mm256_mul_ps(rsq20,rinv20);
668 /* Calculate table index by multiplying r with table scale and truncate to integer */
669 rt = _mm256_mul_ps(r20,vftabscale);
670 vfitab = _mm256_cvttps_epi32(rt);
671 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
672 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
673 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
674 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
675 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
676 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
678 /* CUBIC SPLINE TABLE ELECTROSTATICS */
679 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
680 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
681 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
682 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
683 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
684 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
685 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
686 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
687 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
688 Heps = _mm256_mul_ps(vfeps,H);
689 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
690 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
691 velec = _mm256_mul_ps(qq20,VV);
692 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
693 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
695 /* Update potential sum for this i atom from the interaction with this j atom. */
696 velecsum = _mm256_add_ps(velecsum,velec);
700 /* Calculate temporary vectorial force */
701 tx = _mm256_mul_ps(fscal,dx20);
702 ty = _mm256_mul_ps(fscal,dy20);
703 tz = _mm256_mul_ps(fscal,dz20);
705 /* Update vectorial force */
706 fix2 = _mm256_add_ps(fix2,tx);
707 fiy2 = _mm256_add_ps(fiy2,ty);
708 fiz2 = _mm256_add_ps(fiz2,tz);
710 fjx0 = _mm256_add_ps(fjx0,tx);
711 fjy0 = _mm256_add_ps(fjy0,ty);
712 fjz0 = _mm256_add_ps(fjz0,tz);
714 /**************************
715 * CALCULATE INTERACTIONS *
716 **************************/
718 r21 = _mm256_mul_ps(rsq21,rinv21);
720 /* Calculate table index by multiplying r with table scale and truncate to integer */
721 rt = _mm256_mul_ps(r21,vftabscale);
722 vfitab = _mm256_cvttps_epi32(rt);
723 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
724 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
725 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
726 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
727 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
728 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
730 /* CUBIC SPLINE TABLE ELECTROSTATICS */
731 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
732 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
733 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
734 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
735 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
736 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
737 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
738 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
739 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
740 Heps = _mm256_mul_ps(vfeps,H);
741 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
742 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
743 velec = _mm256_mul_ps(qq21,VV);
744 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
745 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
747 /* Update potential sum for this i atom from the interaction with this j atom. */
748 velecsum = _mm256_add_ps(velecsum,velec);
752 /* Calculate temporary vectorial force */
753 tx = _mm256_mul_ps(fscal,dx21);
754 ty = _mm256_mul_ps(fscal,dy21);
755 tz = _mm256_mul_ps(fscal,dz21);
757 /* Update vectorial force */
758 fix2 = _mm256_add_ps(fix2,tx);
759 fiy2 = _mm256_add_ps(fiy2,ty);
760 fiz2 = _mm256_add_ps(fiz2,tz);
762 fjx1 = _mm256_add_ps(fjx1,tx);
763 fjy1 = _mm256_add_ps(fjy1,ty);
764 fjz1 = _mm256_add_ps(fjz1,tz);
766 /**************************
767 * CALCULATE INTERACTIONS *
768 **************************/
770 r22 = _mm256_mul_ps(rsq22,rinv22);
772 /* Calculate table index by multiplying r with table scale and truncate to integer */
773 rt = _mm256_mul_ps(r22,vftabscale);
774 vfitab = _mm256_cvttps_epi32(rt);
775 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
776 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
777 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
778 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
779 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
780 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
782 /* CUBIC SPLINE TABLE ELECTROSTATICS */
783 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
784 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
785 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
786 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
787 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
788 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
789 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
790 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
791 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
792 Heps = _mm256_mul_ps(vfeps,H);
793 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
794 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
795 velec = _mm256_mul_ps(qq22,VV);
796 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
797 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
799 /* Update potential sum for this i atom from the interaction with this j atom. */
800 velecsum = _mm256_add_ps(velecsum,velec);
804 /* Calculate temporary vectorial force */
805 tx = _mm256_mul_ps(fscal,dx22);
806 ty = _mm256_mul_ps(fscal,dy22);
807 tz = _mm256_mul_ps(fscal,dz22);
809 /* Update vectorial force */
810 fix2 = _mm256_add_ps(fix2,tx);
811 fiy2 = _mm256_add_ps(fiy2,ty);
812 fiz2 = _mm256_add_ps(fiz2,tz);
814 fjx2 = _mm256_add_ps(fjx2,tx);
815 fjy2 = _mm256_add_ps(fjy2,ty);
816 fjz2 = _mm256_add_ps(fjz2,tz);
818 fjptrA = f+j_coord_offsetA;
819 fjptrB = f+j_coord_offsetB;
820 fjptrC = f+j_coord_offsetC;
821 fjptrD = f+j_coord_offsetD;
822 fjptrE = f+j_coord_offsetE;
823 fjptrF = f+j_coord_offsetF;
824 fjptrG = f+j_coord_offsetG;
825 fjptrH = f+j_coord_offsetH;
827 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
828 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
830 /* Inner loop uses 417 flops */
836 /* Get j neighbor index, and coordinate index */
837 jnrlistA = jjnr[jidx];
838 jnrlistB = jjnr[jidx+1];
839 jnrlistC = jjnr[jidx+2];
840 jnrlistD = jjnr[jidx+3];
841 jnrlistE = jjnr[jidx+4];
842 jnrlistF = jjnr[jidx+5];
843 jnrlistG = jjnr[jidx+6];
844 jnrlistH = jjnr[jidx+7];
845 /* Sign of each element will be negative for non-real atoms.
846 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
847 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
849 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
850 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
852 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
853 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
854 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
855 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
856 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
857 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
858 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
859 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
860 j_coord_offsetA = DIM*jnrA;
861 j_coord_offsetB = DIM*jnrB;
862 j_coord_offsetC = DIM*jnrC;
863 j_coord_offsetD = DIM*jnrD;
864 j_coord_offsetE = DIM*jnrE;
865 j_coord_offsetF = DIM*jnrF;
866 j_coord_offsetG = DIM*jnrG;
867 j_coord_offsetH = DIM*jnrH;
869 /* load j atom coordinates */
870 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
871 x+j_coord_offsetC,x+j_coord_offsetD,
872 x+j_coord_offsetE,x+j_coord_offsetF,
873 x+j_coord_offsetG,x+j_coord_offsetH,
874 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
876 /* Calculate displacement vector */
877 dx00 = _mm256_sub_ps(ix0,jx0);
878 dy00 = _mm256_sub_ps(iy0,jy0);
879 dz00 = _mm256_sub_ps(iz0,jz0);
880 dx01 = _mm256_sub_ps(ix0,jx1);
881 dy01 = _mm256_sub_ps(iy0,jy1);
882 dz01 = _mm256_sub_ps(iz0,jz1);
883 dx02 = _mm256_sub_ps(ix0,jx2);
884 dy02 = _mm256_sub_ps(iy0,jy2);
885 dz02 = _mm256_sub_ps(iz0,jz2);
886 dx10 = _mm256_sub_ps(ix1,jx0);
887 dy10 = _mm256_sub_ps(iy1,jy0);
888 dz10 = _mm256_sub_ps(iz1,jz0);
889 dx11 = _mm256_sub_ps(ix1,jx1);
890 dy11 = _mm256_sub_ps(iy1,jy1);
891 dz11 = _mm256_sub_ps(iz1,jz1);
892 dx12 = _mm256_sub_ps(ix1,jx2);
893 dy12 = _mm256_sub_ps(iy1,jy2);
894 dz12 = _mm256_sub_ps(iz1,jz2);
895 dx20 = _mm256_sub_ps(ix2,jx0);
896 dy20 = _mm256_sub_ps(iy2,jy0);
897 dz20 = _mm256_sub_ps(iz2,jz0);
898 dx21 = _mm256_sub_ps(ix2,jx1);
899 dy21 = _mm256_sub_ps(iy2,jy1);
900 dz21 = _mm256_sub_ps(iz2,jz1);
901 dx22 = _mm256_sub_ps(ix2,jx2);
902 dy22 = _mm256_sub_ps(iy2,jy2);
903 dz22 = _mm256_sub_ps(iz2,jz2);
905 /* Calculate squared distance and things based on it */
906 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
907 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
908 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
909 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
910 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
911 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
912 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
913 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
914 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
916 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
917 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
918 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
919 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
920 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
921 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
922 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
923 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
924 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
926 fjx0 = _mm256_setzero_ps();
927 fjy0 = _mm256_setzero_ps();
928 fjz0 = _mm256_setzero_ps();
929 fjx1 = _mm256_setzero_ps();
930 fjy1 = _mm256_setzero_ps();
931 fjz1 = _mm256_setzero_ps();
932 fjx2 = _mm256_setzero_ps();
933 fjy2 = _mm256_setzero_ps();
934 fjz2 = _mm256_setzero_ps();
936 /**************************
937 * CALCULATE INTERACTIONS *
938 **************************/
940 r00 = _mm256_mul_ps(rsq00,rinv00);
941 r00 = _mm256_andnot_ps(dummy_mask,r00);
943 /* Calculate table index by multiplying r with table scale and truncate to integer */
944 rt = _mm256_mul_ps(r00,vftabscale);
945 vfitab = _mm256_cvttps_epi32(rt);
946 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
947 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
948 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
949 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
950 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
951 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
953 /* CUBIC SPLINE TABLE ELECTROSTATICS */
954 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
955 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
956 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
957 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
958 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
959 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
960 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
961 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
962 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
963 Heps = _mm256_mul_ps(vfeps,H);
964 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
965 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
966 velec = _mm256_mul_ps(qq00,VV);
967 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
968 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
970 /* CUBIC SPLINE TABLE DISPERSION */
971 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
972 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
973 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
974 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
975 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
976 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
977 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
978 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
979 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
980 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
981 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
982 Heps = _mm256_mul_ps(vfeps,H);
983 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
984 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
985 vvdw6 = _mm256_mul_ps(c6_00,VV);
986 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
987 fvdw6 = _mm256_mul_ps(c6_00,FF);
989 /* CUBIC SPLINE TABLE REPULSION */
990 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
991 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
992 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
993 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
994 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
995 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
996 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
997 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
998 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
999 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1000 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1001 Heps = _mm256_mul_ps(vfeps,H);
1002 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1003 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1004 vvdw12 = _mm256_mul_ps(c12_00,VV);
1005 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1006 fvdw12 = _mm256_mul_ps(c12_00,FF);
1007 vvdw = _mm256_add_ps(vvdw12,vvdw6);
1008 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1010 /* Update potential sum for this i atom from the interaction with this j atom. */
1011 velec = _mm256_andnot_ps(dummy_mask,velec);
1012 velecsum = _mm256_add_ps(velecsum,velec);
1013 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
1014 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
1016 fscal = _mm256_add_ps(felec,fvdw);
1018 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1020 /* Calculate temporary vectorial force */
1021 tx = _mm256_mul_ps(fscal,dx00);
1022 ty = _mm256_mul_ps(fscal,dy00);
1023 tz = _mm256_mul_ps(fscal,dz00);
1025 /* Update vectorial force */
1026 fix0 = _mm256_add_ps(fix0,tx);
1027 fiy0 = _mm256_add_ps(fiy0,ty);
1028 fiz0 = _mm256_add_ps(fiz0,tz);
1030 fjx0 = _mm256_add_ps(fjx0,tx);
1031 fjy0 = _mm256_add_ps(fjy0,ty);
1032 fjz0 = _mm256_add_ps(fjz0,tz);
1034 /**************************
1035 * CALCULATE INTERACTIONS *
1036 **************************/
1038 r01 = _mm256_mul_ps(rsq01,rinv01);
1039 r01 = _mm256_andnot_ps(dummy_mask,r01);
1041 /* Calculate table index by multiplying r with table scale and truncate to integer */
1042 rt = _mm256_mul_ps(r01,vftabscale);
1043 vfitab = _mm256_cvttps_epi32(rt);
1044 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1045 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1046 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1047 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1048 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1049 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1051 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1052 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1053 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1054 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1055 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1056 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1057 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1058 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1059 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1060 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1061 Heps = _mm256_mul_ps(vfeps,H);
1062 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1063 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1064 velec = _mm256_mul_ps(qq01,VV);
1065 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1066 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1068 /* Update potential sum for this i atom from the interaction with this j atom. */
1069 velec = _mm256_andnot_ps(dummy_mask,velec);
1070 velecsum = _mm256_add_ps(velecsum,velec);
1074 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1076 /* Calculate temporary vectorial force */
1077 tx = _mm256_mul_ps(fscal,dx01);
1078 ty = _mm256_mul_ps(fscal,dy01);
1079 tz = _mm256_mul_ps(fscal,dz01);
1081 /* Update vectorial force */
1082 fix0 = _mm256_add_ps(fix0,tx);
1083 fiy0 = _mm256_add_ps(fiy0,ty);
1084 fiz0 = _mm256_add_ps(fiz0,tz);
1086 fjx1 = _mm256_add_ps(fjx1,tx);
1087 fjy1 = _mm256_add_ps(fjy1,ty);
1088 fjz1 = _mm256_add_ps(fjz1,tz);
1090 /**************************
1091 * CALCULATE INTERACTIONS *
1092 **************************/
1094 r02 = _mm256_mul_ps(rsq02,rinv02);
1095 r02 = _mm256_andnot_ps(dummy_mask,r02);
1097 /* Calculate table index by multiplying r with table scale and truncate to integer */
1098 rt = _mm256_mul_ps(r02,vftabscale);
1099 vfitab = _mm256_cvttps_epi32(rt);
1100 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1101 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1102 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1103 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1104 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1105 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1107 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1108 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1109 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1110 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1111 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1112 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1113 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1114 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1115 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1116 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1117 Heps = _mm256_mul_ps(vfeps,H);
1118 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1119 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1120 velec = _mm256_mul_ps(qq02,VV);
1121 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1122 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1124 /* Update potential sum for this i atom from the interaction with this j atom. */
1125 velec = _mm256_andnot_ps(dummy_mask,velec);
1126 velecsum = _mm256_add_ps(velecsum,velec);
1130 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1132 /* Calculate temporary vectorial force */
1133 tx = _mm256_mul_ps(fscal,dx02);
1134 ty = _mm256_mul_ps(fscal,dy02);
1135 tz = _mm256_mul_ps(fscal,dz02);
1137 /* Update vectorial force */
1138 fix0 = _mm256_add_ps(fix0,tx);
1139 fiy0 = _mm256_add_ps(fiy0,ty);
1140 fiz0 = _mm256_add_ps(fiz0,tz);
1142 fjx2 = _mm256_add_ps(fjx2,tx);
1143 fjy2 = _mm256_add_ps(fjy2,ty);
1144 fjz2 = _mm256_add_ps(fjz2,tz);
1146 /**************************
1147 * CALCULATE INTERACTIONS *
1148 **************************/
1150 r10 = _mm256_mul_ps(rsq10,rinv10);
1151 r10 = _mm256_andnot_ps(dummy_mask,r10);
1153 /* Calculate table index by multiplying r with table scale and truncate to integer */
1154 rt = _mm256_mul_ps(r10,vftabscale);
1155 vfitab = _mm256_cvttps_epi32(rt);
1156 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1157 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1158 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1159 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1160 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1161 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1163 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1164 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1165 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1166 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1167 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1168 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1169 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1170 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1171 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1172 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1173 Heps = _mm256_mul_ps(vfeps,H);
1174 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1175 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1176 velec = _mm256_mul_ps(qq10,VV);
1177 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1178 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1180 /* Update potential sum for this i atom from the interaction with this j atom. */
1181 velec = _mm256_andnot_ps(dummy_mask,velec);
1182 velecsum = _mm256_add_ps(velecsum,velec);
1186 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1188 /* Calculate temporary vectorial force */
1189 tx = _mm256_mul_ps(fscal,dx10);
1190 ty = _mm256_mul_ps(fscal,dy10);
1191 tz = _mm256_mul_ps(fscal,dz10);
1193 /* Update vectorial force */
1194 fix1 = _mm256_add_ps(fix1,tx);
1195 fiy1 = _mm256_add_ps(fiy1,ty);
1196 fiz1 = _mm256_add_ps(fiz1,tz);
1198 fjx0 = _mm256_add_ps(fjx0,tx);
1199 fjy0 = _mm256_add_ps(fjy0,ty);
1200 fjz0 = _mm256_add_ps(fjz0,tz);
1202 /**************************
1203 * CALCULATE INTERACTIONS *
1204 **************************/
1206 r11 = _mm256_mul_ps(rsq11,rinv11);
1207 r11 = _mm256_andnot_ps(dummy_mask,r11);
1209 /* Calculate table index by multiplying r with table scale and truncate to integer */
1210 rt = _mm256_mul_ps(r11,vftabscale);
1211 vfitab = _mm256_cvttps_epi32(rt);
1212 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1213 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1214 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1215 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1216 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1217 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1219 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1220 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1221 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1222 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1223 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1224 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1225 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1226 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1227 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1228 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1229 Heps = _mm256_mul_ps(vfeps,H);
1230 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1231 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1232 velec = _mm256_mul_ps(qq11,VV);
1233 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1234 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1236 /* Update potential sum for this i atom from the interaction with this j atom. */
1237 velec = _mm256_andnot_ps(dummy_mask,velec);
1238 velecsum = _mm256_add_ps(velecsum,velec);
1242 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1244 /* Calculate temporary vectorial force */
1245 tx = _mm256_mul_ps(fscal,dx11);
1246 ty = _mm256_mul_ps(fscal,dy11);
1247 tz = _mm256_mul_ps(fscal,dz11);
1249 /* Update vectorial force */
1250 fix1 = _mm256_add_ps(fix1,tx);
1251 fiy1 = _mm256_add_ps(fiy1,ty);
1252 fiz1 = _mm256_add_ps(fiz1,tz);
1254 fjx1 = _mm256_add_ps(fjx1,tx);
1255 fjy1 = _mm256_add_ps(fjy1,ty);
1256 fjz1 = _mm256_add_ps(fjz1,tz);
1258 /**************************
1259 * CALCULATE INTERACTIONS *
1260 **************************/
1262 r12 = _mm256_mul_ps(rsq12,rinv12);
1263 r12 = _mm256_andnot_ps(dummy_mask,r12);
1265 /* Calculate table index by multiplying r with table scale and truncate to integer */
1266 rt = _mm256_mul_ps(r12,vftabscale);
1267 vfitab = _mm256_cvttps_epi32(rt);
1268 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1269 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1270 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1271 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1272 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1273 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1275 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1276 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1277 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1278 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1279 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1280 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1281 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1282 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1283 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1284 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1285 Heps = _mm256_mul_ps(vfeps,H);
1286 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1287 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1288 velec = _mm256_mul_ps(qq12,VV);
1289 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1290 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1292 /* Update potential sum for this i atom from the interaction with this j atom. */
1293 velec = _mm256_andnot_ps(dummy_mask,velec);
1294 velecsum = _mm256_add_ps(velecsum,velec);
1298 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1300 /* Calculate temporary vectorial force */
1301 tx = _mm256_mul_ps(fscal,dx12);
1302 ty = _mm256_mul_ps(fscal,dy12);
1303 tz = _mm256_mul_ps(fscal,dz12);
1305 /* Update vectorial force */
1306 fix1 = _mm256_add_ps(fix1,tx);
1307 fiy1 = _mm256_add_ps(fiy1,ty);
1308 fiz1 = _mm256_add_ps(fiz1,tz);
1310 fjx2 = _mm256_add_ps(fjx2,tx);
1311 fjy2 = _mm256_add_ps(fjy2,ty);
1312 fjz2 = _mm256_add_ps(fjz2,tz);
1314 /**************************
1315 * CALCULATE INTERACTIONS *
1316 **************************/
1318 r20 = _mm256_mul_ps(rsq20,rinv20);
1319 r20 = _mm256_andnot_ps(dummy_mask,r20);
1321 /* Calculate table index by multiplying r with table scale and truncate to integer */
1322 rt = _mm256_mul_ps(r20,vftabscale);
1323 vfitab = _mm256_cvttps_epi32(rt);
1324 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1325 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1326 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1327 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1328 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1329 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1331 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1332 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1333 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1334 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1335 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1336 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1337 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1338 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1339 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1340 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1341 Heps = _mm256_mul_ps(vfeps,H);
1342 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1343 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1344 velec = _mm256_mul_ps(qq20,VV);
1345 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1346 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1348 /* Update potential sum for this i atom from the interaction with this j atom. */
1349 velec = _mm256_andnot_ps(dummy_mask,velec);
1350 velecsum = _mm256_add_ps(velecsum,velec);
1354 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1356 /* Calculate temporary vectorial force */
1357 tx = _mm256_mul_ps(fscal,dx20);
1358 ty = _mm256_mul_ps(fscal,dy20);
1359 tz = _mm256_mul_ps(fscal,dz20);
1361 /* Update vectorial force */
1362 fix2 = _mm256_add_ps(fix2,tx);
1363 fiy2 = _mm256_add_ps(fiy2,ty);
1364 fiz2 = _mm256_add_ps(fiz2,tz);
1366 fjx0 = _mm256_add_ps(fjx0,tx);
1367 fjy0 = _mm256_add_ps(fjy0,ty);
1368 fjz0 = _mm256_add_ps(fjz0,tz);
1370 /**************************
1371 * CALCULATE INTERACTIONS *
1372 **************************/
1374 r21 = _mm256_mul_ps(rsq21,rinv21);
1375 r21 = _mm256_andnot_ps(dummy_mask,r21);
1377 /* Calculate table index by multiplying r with table scale and truncate to integer */
1378 rt = _mm256_mul_ps(r21,vftabscale);
1379 vfitab = _mm256_cvttps_epi32(rt);
1380 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1381 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1382 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1383 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1384 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1385 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1387 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1388 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1389 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1390 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1391 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1392 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1393 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1394 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1395 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1396 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1397 Heps = _mm256_mul_ps(vfeps,H);
1398 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1399 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1400 velec = _mm256_mul_ps(qq21,VV);
1401 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1402 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1404 /* Update potential sum for this i atom from the interaction with this j atom. */
1405 velec = _mm256_andnot_ps(dummy_mask,velec);
1406 velecsum = _mm256_add_ps(velecsum,velec);
1410 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1412 /* Calculate temporary vectorial force */
1413 tx = _mm256_mul_ps(fscal,dx21);
1414 ty = _mm256_mul_ps(fscal,dy21);
1415 tz = _mm256_mul_ps(fscal,dz21);
1417 /* Update vectorial force */
1418 fix2 = _mm256_add_ps(fix2,tx);
1419 fiy2 = _mm256_add_ps(fiy2,ty);
1420 fiz2 = _mm256_add_ps(fiz2,tz);
1422 fjx1 = _mm256_add_ps(fjx1,tx);
1423 fjy1 = _mm256_add_ps(fjy1,ty);
1424 fjz1 = _mm256_add_ps(fjz1,tz);
1426 /**************************
1427 * CALCULATE INTERACTIONS *
1428 **************************/
1430 r22 = _mm256_mul_ps(rsq22,rinv22);
1431 r22 = _mm256_andnot_ps(dummy_mask,r22);
1433 /* Calculate table index by multiplying r with table scale and truncate to integer */
1434 rt = _mm256_mul_ps(r22,vftabscale);
1435 vfitab = _mm256_cvttps_epi32(rt);
1436 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1437 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1438 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1439 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1440 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1441 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1443 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1444 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1445 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1446 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1447 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1448 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1449 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1450 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1451 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1452 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1453 Heps = _mm256_mul_ps(vfeps,H);
1454 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1455 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1456 velec = _mm256_mul_ps(qq22,VV);
1457 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1458 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1460 /* Update potential sum for this i atom from the interaction with this j atom. */
1461 velec = _mm256_andnot_ps(dummy_mask,velec);
1462 velecsum = _mm256_add_ps(velecsum,velec);
1466 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1468 /* Calculate temporary vectorial force */
1469 tx = _mm256_mul_ps(fscal,dx22);
1470 ty = _mm256_mul_ps(fscal,dy22);
1471 tz = _mm256_mul_ps(fscal,dz22);
1473 /* Update vectorial force */
1474 fix2 = _mm256_add_ps(fix2,tx);
1475 fiy2 = _mm256_add_ps(fiy2,ty);
1476 fiz2 = _mm256_add_ps(fiz2,tz);
1478 fjx2 = _mm256_add_ps(fjx2,tx);
1479 fjy2 = _mm256_add_ps(fjy2,ty);
1480 fjz2 = _mm256_add_ps(fjz2,tz);
1482 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1483 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1484 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1485 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1486 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1487 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1488 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1489 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1491 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1492 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1494 /* Inner loop uses 426 flops */
1497 /* End of innermost loop */
1499 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1500 f+i_coord_offset,fshift+i_shift_offset);
1503 /* Update potential energies */
1504 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1505 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1507 /* Increment number of inner iterations */
1508 inneriter += j_index_end - j_index_start;
1510 /* Outer loop uses 20 flops */
1513 /* Increment number of outer iterations */
1516 /* Update outer/inner flops */
1518 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
1521 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1522 * Electrostatics interaction: CubicSplineTable
1523 * VdW interaction: CubicSplineTable
1524 * Geometry: Water3-Water3
1525 * Calculate force/pot: Force
1528 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1529 (t_nblist * gmx_restrict nlist,
1530 rvec * gmx_restrict xx,
1531 rvec * gmx_restrict ff,
1532 t_forcerec * gmx_restrict fr,
1533 t_mdatoms * gmx_restrict mdatoms,
1534 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1535 t_nrnb * gmx_restrict nrnb)
1537 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1538 * just 0 for non-waters.
1539 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1540 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1542 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1543 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1544 int jnrA,jnrB,jnrC,jnrD;
1545 int jnrE,jnrF,jnrG,jnrH;
1546 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1547 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1548 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1549 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1550 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1551 real rcutoff_scalar;
1552 real *shiftvec,*fshift,*x,*f;
1553 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1554 real scratch[4*DIM];
1555 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1556 real * vdwioffsetptr0;
1557 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1558 real * vdwioffsetptr1;
1559 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1560 real * vdwioffsetptr2;
1561 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1562 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1563 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1564 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1565 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1566 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1567 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1568 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1569 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1570 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1571 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1572 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1573 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1574 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1575 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1576 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1577 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1580 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1583 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1584 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1586 __m128i vfitab_lo,vfitab_hi;
1587 __m128i ifour = _mm_set1_epi32(4);
1588 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1590 __m256 dummy_mask,cutoff_mask;
1591 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1592 __m256 one = _mm256_set1_ps(1.0);
1593 __m256 two = _mm256_set1_ps(2.0);
1599 jindex = nlist->jindex;
1601 shiftidx = nlist->shift;
1603 shiftvec = fr->shift_vec[0];
1604 fshift = fr->fshift[0];
1605 facel = _mm256_set1_ps(fr->epsfac);
1606 charge = mdatoms->chargeA;
1607 nvdwtype = fr->ntype;
1608 vdwparam = fr->nbfp;
1609 vdwtype = mdatoms->typeA;
1611 vftab = kernel_data->table_elec_vdw->data;
1612 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
1614 /* Setup water-specific parameters */
1615 inr = nlist->iinr[0];
1616 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1617 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1618 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1619 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1621 jq0 = _mm256_set1_ps(charge[inr+0]);
1622 jq1 = _mm256_set1_ps(charge[inr+1]);
1623 jq2 = _mm256_set1_ps(charge[inr+2]);
1624 vdwjidx0A = 2*vdwtype[inr+0];
1625 qq00 = _mm256_mul_ps(iq0,jq0);
1626 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1627 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1628 qq01 = _mm256_mul_ps(iq0,jq1);
1629 qq02 = _mm256_mul_ps(iq0,jq2);
1630 qq10 = _mm256_mul_ps(iq1,jq0);
1631 qq11 = _mm256_mul_ps(iq1,jq1);
1632 qq12 = _mm256_mul_ps(iq1,jq2);
1633 qq20 = _mm256_mul_ps(iq2,jq0);
1634 qq21 = _mm256_mul_ps(iq2,jq1);
1635 qq22 = _mm256_mul_ps(iq2,jq2);
1637 /* Avoid stupid compiler warnings */
1638 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1639 j_coord_offsetA = 0;
1640 j_coord_offsetB = 0;
1641 j_coord_offsetC = 0;
1642 j_coord_offsetD = 0;
1643 j_coord_offsetE = 0;
1644 j_coord_offsetF = 0;
1645 j_coord_offsetG = 0;
1646 j_coord_offsetH = 0;
1651 for(iidx=0;iidx<4*DIM;iidx++)
1653 scratch[iidx] = 0.0;
1656 /* Start outer loop over neighborlists */
1657 for(iidx=0; iidx<nri; iidx++)
1659 /* Load shift vector for this list */
1660 i_shift_offset = DIM*shiftidx[iidx];
1662 /* Load limits for loop over neighbors */
1663 j_index_start = jindex[iidx];
1664 j_index_end = jindex[iidx+1];
1666 /* Get outer coordinate index */
1668 i_coord_offset = DIM*inr;
1670 /* Load i particle coords and add shift vector */
1671 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1672 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1674 fix0 = _mm256_setzero_ps();
1675 fiy0 = _mm256_setzero_ps();
1676 fiz0 = _mm256_setzero_ps();
1677 fix1 = _mm256_setzero_ps();
1678 fiy1 = _mm256_setzero_ps();
1679 fiz1 = _mm256_setzero_ps();
1680 fix2 = _mm256_setzero_ps();
1681 fiy2 = _mm256_setzero_ps();
1682 fiz2 = _mm256_setzero_ps();
1684 /* Start inner kernel loop */
1685 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1688 /* Get j neighbor index, and coordinate index */
1690 jnrB = jjnr[jidx+1];
1691 jnrC = jjnr[jidx+2];
1692 jnrD = jjnr[jidx+3];
1693 jnrE = jjnr[jidx+4];
1694 jnrF = jjnr[jidx+5];
1695 jnrG = jjnr[jidx+6];
1696 jnrH = jjnr[jidx+7];
1697 j_coord_offsetA = DIM*jnrA;
1698 j_coord_offsetB = DIM*jnrB;
1699 j_coord_offsetC = DIM*jnrC;
1700 j_coord_offsetD = DIM*jnrD;
1701 j_coord_offsetE = DIM*jnrE;
1702 j_coord_offsetF = DIM*jnrF;
1703 j_coord_offsetG = DIM*jnrG;
1704 j_coord_offsetH = DIM*jnrH;
1706 /* load j atom coordinates */
1707 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1708 x+j_coord_offsetC,x+j_coord_offsetD,
1709 x+j_coord_offsetE,x+j_coord_offsetF,
1710 x+j_coord_offsetG,x+j_coord_offsetH,
1711 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1713 /* Calculate displacement vector */
1714 dx00 = _mm256_sub_ps(ix0,jx0);
1715 dy00 = _mm256_sub_ps(iy0,jy0);
1716 dz00 = _mm256_sub_ps(iz0,jz0);
1717 dx01 = _mm256_sub_ps(ix0,jx1);
1718 dy01 = _mm256_sub_ps(iy0,jy1);
1719 dz01 = _mm256_sub_ps(iz0,jz1);
1720 dx02 = _mm256_sub_ps(ix0,jx2);
1721 dy02 = _mm256_sub_ps(iy0,jy2);
1722 dz02 = _mm256_sub_ps(iz0,jz2);
1723 dx10 = _mm256_sub_ps(ix1,jx0);
1724 dy10 = _mm256_sub_ps(iy1,jy0);
1725 dz10 = _mm256_sub_ps(iz1,jz0);
1726 dx11 = _mm256_sub_ps(ix1,jx1);
1727 dy11 = _mm256_sub_ps(iy1,jy1);
1728 dz11 = _mm256_sub_ps(iz1,jz1);
1729 dx12 = _mm256_sub_ps(ix1,jx2);
1730 dy12 = _mm256_sub_ps(iy1,jy2);
1731 dz12 = _mm256_sub_ps(iz1,jz2);
1732 dx20 = _mm256_sub_ps(ix2,jx0);
1733 dy20 = _mm256_sub_ps(iy2,jy0);
1734 dz20 = _mm256_sub_ps(iz2,jz0);
1735 dx21 = _mm256_sub_ps(ix2,jx1);
1736 dy21 = _mm256_sub_ps(iy2,jy1);
1737 dz21 = _mm256_sub_ps(iz2,jz1);
1738 dx22 = _mm256_sub_ps(ix2,jx2);
1739 dy22 = _mm256_sub_ps(iy2,jy2);
1740 dz22 = _mm256_sub_ps(iz2,jz2);
1742 /* Calculate squared distance and things based on it */
1743 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1744 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1745 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1746 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1747 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1748 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1749 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1750 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1751 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1753 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1754 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1755 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1756 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1757 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1758 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1759 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1760 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1761 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1763 fjx0 = _mm256_setzero_ps();
1764 fjy0 = _mm256_setzero_ps();
1765 fjz0 = _mm256_setzero_ps();
1766 fjx1 = _mm256_setzero_ps();
1767 fjy1 = _mm256_setzero_ps();
1768 fjz1 = _mm256_setzero_ps();
1769 fjx2 = _mm256_setzero_ps();
1770 fjy2 = _mm256_setzero_ps();
1771 fjz2 = _mm256_setzero_ps();
1773 /**************************
1774 * CALCULATE INTERACTIONS *
1775 **************************/
1777 r00 = _mm256_mul_ps(rsq00,rinv00);
1779 /* Calculate table index by multiplying r with table scale and truncate to integer */
1780 rt = _mm256_mul_ps(r00,vftabscale);
1781 vfitab = _mm256_cvttps_epi32(rt);
1782 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1783 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1784 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1785 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1786 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1787 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1789 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1790 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1791 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1792 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1793 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1794 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1795 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1796 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1797 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1798 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1799 Heps = _mm256_mul_ps(vfeps,H);
1800 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1801 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1802 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1804 /* CUBIC SPLINE TABLE DISPERSION */
1805 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1806 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1807 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1808 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1809 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1810 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1811 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1812 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1813 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1814 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1815 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1816 Heps = _mm256_mul_ps(vfeps,H);
1817 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1818 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1819 fvdw6 = _mm256_mul_ps(c6_00,FF);
1821 /* CUBIC SPLINE TABLE REPULSION */
1822 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1823 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1824 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1825 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1826 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1827 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1828 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1829 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1830 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1831 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1832 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1833 Heps = _mm256_mul_ps(vfeps,H);
1834 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1835 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1836 fvdw12 = _mm256_mul_ps(c12_00,FF);
1837 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1839 fscal = _mm256_add_ps(felec,fvdw);
1841 /* Calculate temporary vectorial force */
1842 tx = _mm256_mul_ps(fscal,dx00);
1843 ty = _mm256_mul_ps(fscal,dy00);
1844 tz = _mm256_mul_ps(fscal,dz00);
1846 /* Update vectorial force */
1847 fix0 = _mm256_add_ps(fix0,tx);
1848 fiy0 = _mm256_add_ps(fiy0,ty);
1849 fiz0 = _mm256_add_ps(fiz0,tz);
1851 fjx0 = _mm256_add_ps(fjx0,tx);
1852 fjy0 = _mm256_add_ps(fjy0,ty);
1853 fjz0 = _mm256_add_ps(fjz0,tz);
1855 /**************************
1856 * CALCULATE INTERACTIONS *
1857 **************************/
1859 r01 = _mm256_mul_ps(rsq01,rinv01);
1861 /* Calculate table index by multiplying r with table scale and truncate to integer */
1862 rt = _mm256_mul_ps(r01,vftabscale);
1863 vfitab = _mm256_cvttps_epi32(rt);
1864 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1865 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1866 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1867 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1868 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1869 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1871 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1872 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1873 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1874 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1875 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1876 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1877 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1878 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1879 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1880 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1881 Heps = _mm256_mul_ps(vfeps,H);
1882 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1883 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1884 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1888 /* Calculate temporary vectorial force */
1889 tx = _mm256_mul_ps(fscal,dx01);
1890 ty = _mm256_mul_ps(fscal,dy01);
1891 tz = _mm256_mul_ps(fscal,dz01);
1893 /* Update vectorial force */
1894 fix0 = _mm256_add_ps(fix0,tx);
1895 fiy0 = _mm256_add_ps(fiy0,ty);
1896 fiz0 = _mm256_add_ps(fiz0,tz);
1898 fjx1 = _mm256_add_ps(fjx1,tx);
1899 fjy1 = _mm256_add_ps(fjy1,ty);
1900 fjz1 = _mm256_add_ps(fjz1,tz);
1902 /**************************
1903 * CALCULATE INTERACTIONS *
1904 **************************/
1906 r02 = _mm256_mul_ps(rsq02,rinv02);
1908 /* Calculate table index by multiplying r with table scale and truncate to integer */
1909 rt = _mm256_mul_ps(r02,vftabscale);
1910 vfitab = _mm256_cvttps_epi32(rt);
1911 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1912 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1913 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1914 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1915 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1916 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1918 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1919 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1920 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1921 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1922 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1923 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1924 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1925 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1926 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1927 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1928 Heps = _mm256_mul_ps(vfeps,H);
1929 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1930 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1931 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1935 /* Calculate temporary vectorial force */
1936 tx = _mm256_mul_ps(fscal,dx02);
1937 ty = _mm256_mul_ps(fscal,dy02);
1938 tz = _mm256_mul_ps(fscal,dz02);
1940 /* Update vectorial force */
1941 fix0 = _mm256_add_ps(fix0,tx);
1942 fiy0 = _mm256_add_ps(fiy0,ty);
1943 fiz0 = _mm256_add_ps(fiz0,tz);
1945 fjx2 = _mm256_add_ps(fjx2,tx);
1946 fjy2 = _mm256_add_ps(fjy2,ty);
1947 fjz2 = _mm256_add_ps(fjz2,tz);
1949 /**************************
1950 * CALCULATE INTERACTIONS *
1951 **************************/
1953 r10 = _mm256_mul_ps(rsq10,rinv10);
1955 /* Calculate table index by multiplying r with table scale and truncate to integer */
1956 rt = _mm256_mul_ps(r10,vftabscale);
1957 vfitab = _mm256_cvttps_epi32(rt);
1958 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1959 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1960 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1961 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1962 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1963 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1965 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1966 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1967 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1968 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1969 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1970 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1971 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1972 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1973 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1974 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1975 Heps = _mm256_mul_ps(vfeps,H);
1976 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1977 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1978 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1982 /* Calculate temporary vectorial force */
1983 tx = _mm256_mul_ps(fscal,dx10);
1984 ty = _mm256_mul_ps(fscal,dy10);
1985 tz = _mm256_mul_ps(fscal,dz10);
1987 /* Update vectorial force */
1988 fix1 = _mm256_add_ps(fix1,tx);
1989 fiy1 = _mm256_add_ps(fiy1,ty);
1990 fiz1 = _mm256_add_ps(fiz1,tz);
1992 fjx0 = _mm256_add_ps(fjx0,tx);
1993 fjy0 = _mm256_add_ps(fjy0,ty);
1994 fjz0 = _mm256_add_ps(fjz0,tz);
1996 /**************************
1997 * CALCULATE INTERACTIONS *
1998 **************************/
2000 r11 = _mm256_mul_ps(rsq11,rinv11);
2002 /* Calculate table index by multiplying r with table scale and truncate to integer */
2003 rt = _mm256_mul_ps(r11,vftabscale);
2004 vfitab = _mm256_cvttps_epi32(rt);
2005 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2006 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2007 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2008 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2009 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2010 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2012 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2013 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2014 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2015 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2016 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2017 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2018 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2019 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2020 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2021 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2022 Heps = _mm256_mul_ps(vfeps,H);
2023 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2024 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2025 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2029 /* Calculate temporary vectorial force */
2030 tx = _mm256_mul_ps(fscal,dx11);
2031 ty = _mm256_mul_ps(fscal,dy11);
2032 tz = _mm256_mul_ps(fscal,dz11);
2034 /* Update vectorial force */
2035 fix1 = _mm256_add_ps(fix1,tx);
2036 fiy1 = _mm256_add_ps(fiy1,ty);
2037 fiz1 = _mm256_add_ps(fiz1,tz);
2039 fjx1 = _mm256_add_ps(fjx1,tx);
2040 fjy1 = _mm256_add_ps(fjy1,ty);
2041 fjz1 = _mm256_add_ps(fjz1,tz);
2043 /**************************
2044 * CALCULATE INTERACTIONS *
2045 **************************/
2047 r12 = _mm256_mul_ps(rsq12,rinv12);
2049 /* Calculate table index by multiplying r with table scale and truncate to integer */
2050 rt = _mm256_mul_ps(r12,vftabscale);
2051 vfitab = _mm256_cvttps_epi32(rt);
2052 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2053 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2054 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2055 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2056 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2057 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2059 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2060 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2061 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2062 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2063 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2064 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2065 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2066 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2067 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2068 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2069 Heps = _mm256_mul_ps(vfeps,H);
2070 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2071 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2072 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2076 /* Calculate temporary vectorial force */
2077 tx = _mm256_mul_ps(fscal,dx12);
2078 ty = _mm256_mul_ps(fscal,dy12);
2079 tz = _mm256_mul_ps(fscal,dz12);
2081 /* Update vectorial force */
2082 fix1 = _mm256_add_ps(fix1,tx);
2083 fiy1 = _mm256_add_ps(fiy1,ty);
2084 fiz1 = _mm256_add_ps(fiz1,tz);
2086 fjx2 = _mm256_add_ps(fjx2,tx);
2087 fjy2 = _mm256_add_ps(fjy2,ty);
2088 fjz2 = _mm256_add_ps(fjz2,tz);
2090 /**************************
2091 * CALCULATE INTERACTIONS *
2092 **************************/
2094 r20 = _mm256_mul_ps(rsq20,rinv20);
2096 /* Calculate table index by multiplying r with table scale and truncate to integer */
2097 rt = _mm256_mul_ps(r20,vftabscale);
2098 vfitab = _mm256_cvttps_epi32(rt);
2099 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2100 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2101 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2102 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2103 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2104 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2106 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2107 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2108 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2109 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2110 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2111 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2112 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2113 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2114 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2115 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2116 Heps = _mm256_mul_ps(vfeps,H);
2117 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2118 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2119 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2123 /* Calculate temporary vectorial force */
2124 tx = _mm256_mul_ps(fscal,dx20);
2125 ty = _mm256_mul_ps(fscal,dy20);
2126 tz = _mm256_mul_ps(fscal,dz20);
2128 /* Update vectorial force */
2129 fix2 = _mm256_add_ps(fix2,tx);
2130 fiy2 = _mm256_add_ps(fiy2,ty);
2131 fiz2 = _mm256_add_ps(fiz2,tz);
2133 fjx0 = _mm256_add_ps(fjx0,tx);
2134 fjy0 = _mm256_add_ps(fjy0,ty);
2135 fjz0 = _mm256_add_ps(fjz0,tz);
2137 /**************************
2138 * CALCULATE INTERACTIONS *
2139 **************************/
2141 r21 = _mm256_mul_ps(rsq21,rinv21);
2143 /* Calculate table index by multiplying r with table scale and truncate to integer */
2144 rt = _mm256_mul_ps(r21,vftabscale);
2145 vfitab = _mm256_cvttps_epi32(rt);
2146 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2147 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2148 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2149 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2150 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2151 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2153 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2154 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2155 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2156 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2157 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2158 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2159 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2160 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2161 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2162 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2163 Heps = _mm256_mul_ps(vfeps,H);
2164 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2165 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2166 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2170 /* Calculate temporary vectorial force */
2171 tx = _mm256_mul_ps(fscal,dx21);
2172 ty = _mm256_mul_ps(fscal,dy21);
2173 tz = _mm256_mul_ps(fscal,dz21);
2175 /* Update vectorial force */
2176 fix2 = _mm256_add_ps(fix2,tx);
2177 fiy2 = _mm256_add_ps(fiy2,ty);
2178 fiz2 = _mm256_add_ps(fiz2,tz);
2180 fjx1 = _mm256_add_ps(fjx1,tx);
2181 fjy1 = _mm256_add_ps(fjy1,ty);
2182 fjz1 = _mm256_add_ps(fjz1,tz);
2184 /**************************
2185 * CALCULATE INTERACTIONS *
2186 **************************/
2188 r22 = _mm256_mul_ps(rsq22,rinv22);
2190 /* Calculate table index by multiplying r with table scale and truncate to integer */
2191 rt = _mm256_mul_ps(r22,vftabscale);
2192 vfitab = _mm256_cvttps_epi32(rt);
2193 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2194 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2195 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2196 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2197 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2198 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2200 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2201 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2202 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2203 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2204 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2205 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2206 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2207 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2208 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2209 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2210 Heps = _mm256_mul_ps(vfeps,H);
2211 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2212 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2213 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2217 /* Calculate temporary vectorial force */
2218 tx = _mm256_mul_ps(fscal,dx22);
2219 ty = _mm256_mul_ps(fscal,dy22);
2220 tz = _mm256_mul_ps(fscal,dz22);
2222 /* Update vectorial force */
2223 fix2 = _mm256_add_ps(fix2,tx);
2224 fiy2 = _mm256_add_ps(fiy2,ty);
2225 fiz2 = _mm256_add_ps(fiz2,tz);
2227 fjx2 = _mm256_add_ps(fjx2,tx);
2228 fjy2 = _mm256_add_ps(fjy2,ty);
2229 fjz2 = _mm256_add_ps(fjz2,tz);
2231 fjptrA = f+j_coord_offsetA;
2232 fjptrB = f+j_coord_offsetB;
2233 fjptrC = f+j_coord_offsetC;
2234 fjptrD = f+j_coord_offsetD;
2235 fjptrE = f+j_coord_offsetE;
2236 fjptrF = f+j_coord_offsetF;
2237 fjptrG = f+j_coord_offsetG;
2238 fjptrH = f+j_coord_offsetH;
2240 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2241 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2243 /* Inner loop uses 373 flops */
2246 if(jidx<j_index_end)
2249 /* Get j neighbor index, and coordinate index */
2250 jnrlistA = jjnr[jidx];
2251 jnrlistB = jjnr[jidx+1];
2252 jnrlistC = jjnr[jidx+2];
2253 jnrlistD = jjnr[jidx+3];
2254 jnrlistE = jjnr[jidx+4];
2255 jnrlistF = jjnr[jidx+5];
2256 jnrlistG = jjnr[jidx+6];
2257 jnrlistH = jjnr[jidx+7];
2258 /* Sign of each element will be negative for non-real atoms.
2259 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2260 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2262 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2263 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2265 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2266 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2267 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2268 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2269 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2270 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2271 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2272 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2273 j_coord_offsetA = DIM*jnrA;
2274 j_coord_offsetB = DIM*jnrB;
2275 j_coord_offsetC = DIM*jnrC;
2276 j_coord_offsetD = DIM*jnrD;
2277 j_coord_offsetE = DIM*jnrE;
2278 j_coord_offsetF = DIM*jnrF;
2279 j_coord_offsetG = DIM*jnrG;
2280 j_coord_offsetH = DIM*jnrH;
2282 /* load j atom coordinates */
2283 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2284 x+j_coord_offsetC,x+j_coord_offsetD,
2285 x+j_coord_offsetE,x+j_coord_offsetF,
2286 x+j_coord_offsetG,x+j_coord_offsetH,
2287 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2289 /* Calculate displacement vector */
2290 dx00 = _mm256_sub_ps(ix0,jx0);
2291 dy00 = _mm256_sub_ps(iy0,jy0);
2292 dz00 = _mm256_sub_ps(iz0,jz0);
2293 dx01 = _mm256_sub_ps(ix0,jx1);
2294 dy01 = _mm256_sub_ps(iy0,jy1);
2295 dz01 = _mm256_sub_ps(iz0,jz1);
2296 dx02 = _mm256_sub_ps(ix0,jx2);
2297 dy02 = _mm256_sub_ps(iy0,jy2);
2298 dz02 = _mm256_sub_ps(iz0,jz2);
2299 dx10 = _mm256_sub_ps(ix1,jx0);
2300 dy10 = _mm256_sub_ps(iy1,jy0);
2301 dz10 = _mm256_sub_ps(iz1,jz0);
2302 dx11 = _mm256_sub_ps(ix1,jx1);
2303 dy11 = _mm256_sub_ps(iy1,jy1);
2304 dz11 = _mm256_sub_ps(iz1,jz1);
2305 dx12 = _mm256_sub_ps(ix1,jx2);
2306 dy12 = _mm256_sub_ps(iy1,jy2);
2307 dz12 = _mm256_sub_ps(iz1,jz2);
2308 dx20 = _mm256_sub_ps(ix2,jx0);
2309 dy20 = _mm256_sub_ps(iy2,jy0);
2310 dz20 = _mm256_sub_ps(iz2,jz0);
2311 dx21 = _mm256_sub_ps(ix2,jx1);
2312 dy21 = _mm256_sub_ps(iy2,jy1);
2313 dz21 = _mm256_sub_ps(iz2,jz1);
2314 dx22 = _mm256_sub_ps(ix2,jx2);
2315 dy22 = _mm256_sub_ps(iy2,jy2);
2316 dz22 = _mm256_sub_ps(iz2,jz2);
2318 /* Calculate squared distance and things based on it */
2319 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2320 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2321 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2322 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2323 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2324 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2325 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2326 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2327 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2329 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2330 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
2331 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
2332 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
2333 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2334 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2335 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
2336 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2337 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2339 fjx0 = _mm256_setzero_ps();
2340 fjy0 = _mm256_setzero_ps();
2341 fjz0 = _mm256_setzero_ps();
2342 fjx1 = _mm256_setzero_ps();
2343 fjy1 = _mm256_setzero_ps();
2344 fjz1 = _mm256_setzero_ps();
2345 fjx2 = _mm256_setzero_ps();
2346 fjy2 = _mm256_setzero_ps();
2347 fjz2 = _mm256_setzero_ps();
2349 /**************************
2350 * CALCULATE INTERACTIONS *
2351 **************************/
2353 r00 = _mm256_mul_ps(rsq00,rinv00);
2354 r00 = _mm256_andnot_ps(dummy_mask,r00);
2356 /* Calculate table index by multiplying r with table scale and truncate to integer */
2357 rt = _mm256_mul_ps(r00,vftabscale);
2358 vfitab = _mm256_cvttps_epi32(rt);
2359 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2360 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2361 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2362 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2363 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2364 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2366 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2367 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2368 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2369 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2370 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2371 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2372 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2373 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2374 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2375 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2376 Heps = _mm256_mul_ps(vfeps,H);
2377 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2378 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2379 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2381 /* CUBIC SPLINE TABLE DISPERSION */
2382 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2383 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2384 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2385 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2386 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2387 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2388 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2389 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2390 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2391 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2392 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2393 Heps = _mm256_mul_ps(vfeps,H);
2394 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2395 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2396 fvdw6 = _mm256_mul_ps(c6_00,FF);
2398 /* CUBIC SPLINE TABLE REPULSION */
2399 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2400 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2401 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2402 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2403 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2404 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2405 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2406 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2407 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2408 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2409 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2410 Heps = _mm256_mul_ps(vfeps,H);
2411 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2412 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2413 fvdw12 = _mm256_mul_ps(c12_00,FF);
2414 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2416 fscal = _mm256_add_ps(felec,fvdw);
2418 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2420 /* Calculate temporary vectorial force */
2421 tx = _mm256_mul_ps(fscal,dx00);
2422 ty = _mm256_mul_ps(fscal,dy00);
2423 tz = _mm256_mul_ps(fscal,dz00);
2425 /* Update vectorial force */
2426 fix0 = _mm256_add_ps(fix0,tx);
2427 fiy0 = _mm256_add_ps(fiy0,ty);
2428 fiz0 = _mm256_add_ps(fiz0,tz);
2430 fjx0 = _mm256_add_ps(fjx0,tx);
2431 fjy0 = _mm256_add_ps(fjy0,ty);
2432 fjz0 = _mm256_add_ps(fjz0,tz);
2434 /**************************
2435 * CALCULATE INTERACTIONS *
2436 **************************/
2438 r01 = _mm256_mul_ps(rsq01,rinv01);
2439 r01 = _mm256_andnot_ps(dummy_mask,r01);
2441 /* Calculate table index by multiplying r with table scale and truncate to integer */
2442 rt = _mm256_mul_ps(r01,vftabscale);
2443 vfitab = _mm256_cvttps_epi32(rt);
2444 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2445 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2446 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2447 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2448 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2449 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2451 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2452 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2453 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2454 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2455 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2456 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2457 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2458 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2459 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2460 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2461 Heps = _mm256_mul_ps(vfeps,H);
2462 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2463 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2464 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2468 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2470 /* Calculate temporary vectorial force */
2471 tx = _mm256_mul_ps(fscal,dx01);
2472 ty = _mm256_mul_ps(fscal,dy01);
2473 tz = _mm256_mul_ps(fscal,dz01);
2475 /* Update vectorial force */
2476 fix0 = _mm256_add_ps(fix0,tx);
2477 fiy0 = _mm256_add_ps(fiy0,ty);
2478 fiz0 = _mm256_add_ps(fiz0,tz);
2480 fjx1 = _mm256_add_ps(fjx1,tx);
2481 fjy1 = _mm256_add_ps(fjy1,ty);
2482 fjz1 = _mm256_add_ps(fjz1,tz);
2484 /**************************
2485 * CALCULATE INTERACTIONS *
2486 **************************/
2488 r02 = _mm256_mul_ps(rsq02,rinv02);
2489 r02 = _mm256_andnot_ps(dummy_mask,r02);
2491 /* Calculate table index by multiplying r with table scale and truncate to integer */
2492 rt = _mm256_mul_ps(r02,vftabscale);
2493 vfitab = _mm256_cvttps_epi32(rt);
2494 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2495 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2496 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2497 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2498 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2499 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2501 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2502 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2503 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2504 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2505 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2506 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2507 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2508 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2509 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2510 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2511 Heps = _mm256_mul_ps(vfeps,H);
2512 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2513 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2514 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2518 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2520 /* Calculate temporary vectorial force */
2521 tx = _mm256_mul_ps(fscal,dx02);
2522 ty = _mm256_mul_ps(fscal,dy02);
2523 tz = _mm256_mul_ps(fscal,dz02);
2525 /* Update vectorial force */
2526 fix0 = _mm256_add_ps(fix0,tx);
2527 fiy0 = _mm256_add_ps(fiy0,ty);
2528 fiz0 = _mm256_add_ps(fiz0,tz);
2530 fjx2 = _mm256_add_ps(fjx2,tx);
2531 fjy2 = _mm256_add_ps(fjy2,ty);
2532 fjz2 = _mm256_add_ps(fjz2,tz);
2534 /**************************
2535 * CALCULATE INTERACTIONS *
2536 **************************/
2538 r10 = _mm256_mul_ps(rsq10,rinv10);
2539 r10 = _mm256_andnot_ps(dummy_mask,r10);
2541 /* Calculate table index by multiplying r with table scale and truncate to integer */
2542 rt = _mm256_mul_ps(r10,vftabscale);
2543 vfitab = _mm256_cvttps_epi32(rt);
2544 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2545 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2546 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2547 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2548 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2549 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2551 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2552 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2553 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2554 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2555 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2556 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2557 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2558 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2559 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2560 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2561 Heps = _mm256_mul_ps(vfeps,H);
2562 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2563 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2564 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2568 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2570 /* Calculate temporary vectorial force */
2571 tx = _mm256_mul_ps(fscal,dx10);
2572 ty = _mm256_mul_ps(fscal,dy10);
2573 tz = _mm256_mul_ps(fscal,dz10);
2575 /* Update vectorial force */
2576 fix1 = _mm256_add_ps(fix1,tx);
2577 fiy1 = _mm256_add_ps(fiy1,ty);
2578 fiz1 = _mm256_add_ps(fiz1,tz);
2580 fjx0 = _mm256_add_ps(fjx0,tx);
2581 fjy0 = _mm256_add_ps(fjy0,ty);
2582 fjz0 = _mm256_add_ps(fjz0,tz);
2584 /**************************
2585 * CALCULATE INTERACTIONS *
2586 **************************/
2588 r11 = _mm256_mul_ps(rsq11,rinv11);
2589 r11 = _mm256_andnot_ps(dummy_mask,r11);
2591 /* Calculate table index by multiplying r with table scale and truncate to integer */
2592 rt = _mm256_mul_ps(r11,vftabscale);
2593 vfitab = _mm256_cvttps_epi32(rt);
2594 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2595 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2596 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2597 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2598 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2599 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2601 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2602 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2603 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2604 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2605 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2606 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2607 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2608 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2609 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2610 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2611 Heps = _mm256_mul_ps(vfeps,H);
2612 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2613 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2614 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2618 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2620 /* Calculate temporary vectorial force */
2621 tx = _mm256_mul_ps(fscal,dx11);
2622 ty = _mm256_mul_ps(fscal,dy11);
2623 tz = _mm256_mul_ps(fscal,dz11);
2625 /* Update vectorial force */
2626 fix1 = _mm256_add_ps(fix1,tx);
2627 fiy1 = _mm256_add_ps(fiy1,ty);
2628 fiz1 = _mm256_add_ps(fiz1,tz);
2630 fjx1 = _mm256_add_ps(fjx1,tx);
2631 fjy1 = _mm256_add_ps(fjy1,ty);
2632 fjz1 = _mm256_add_ps(fjz1,tz);
2634 /**************************
2635 * CALCULATE INTERACTIONS *
2636 **************************/
2638 r12 = _mm256_mul_ps(rsq12,rinv12);
2639 r12 = _mm256_andnot_ps(dummy_mask,r12);
2641 /* Calculate table index by multiplying r with table scale and truncate to integer */
2642 rt = _mm256_mul_ps(r12,vftabscale);
2643 vfitab = _mm256_cvttps_epi32(rt);
2644 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2645 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2646 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2647 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2648 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2649 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2651 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2652 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2653 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2654 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2655 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2656 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2657 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2658 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2659 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2660 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2661 Heps = _mm256_mul_ps(vfeps,H);
2662 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2663 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2664 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2668 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2670 /* Calculate temporary vectorial force */
2671 tx = _mm256_mul_ps(fscal,dx12);
2672 ty = _mm256_mul_ps(fscal,dy12);
2673 tz = _mm256_mul_ps(fscal,dz12);
2675 /* Update vectorial force */
2676 fix1 = _mm256_add_ps(fix1,tx);
2677 fiy1 = _mm256_add_ps(fiy1,ty);
2678 fiz1 = _mm256_add_ps(fiz1,tz);
2680 fjx2 = _mm256_add_ps(fjx2,tx);
2681 fjy2 = _mm256_add_ps(fjy2,ty);
2682 fjz2 = _mm256_add_ps(fjz2,tz);
2684 /**************************
2685 * CALCULATE INTERACTIONS *
2686 **************************/
2688 r20 = _mm256_mul_ps(rsq20,rinv20);
2689 r20 = _mm256_andnot_ps(dummy_mask,r20);
2691 /* Calculate table index by multiplying r with table scale and truncate to integer */
2692 rt = _mm256_mul_ps(r20,vftabscale);
2693 vfitab = _mm256_cvttps_epi32(rt);
2694 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2695 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2696 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2697 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2698 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2699 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2701 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2702 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2703 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2704 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2705 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2706 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2707 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2708 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2709 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2710 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2711 Heps = _mm256_mul_ps(vfeps,H);
2712 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2713 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2714 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2718 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2720 /* Calculate temporary vectorial force */
2721 tx = _mm256_mul_ps(fscal,dx20);
2722 ty = _mm256_mul_ps(fscal,dy20);
2723 tz = _mm256_mul_ps(fscal,dz20);
2725 /* Update vectorial force */
2726 fix2 = _mm256_add_ps(fix2,tx);
2727 fiy2 = _mm256_add_ps(fiy2,ty);
2728 fiz2 = _mm256_add_ps(fiz2,tz);
2730 fjx0 = _mm256_add_ps(fjx0,tx);
2731 fjy0 = _mm256_add_ps(fjy0,ty);
2732 fjz0 = _mm256_add_ps(fjz0,tz);
2734 /**************************
2735 * CALCULATE INTERACTIONS *
2736 **************************/
2738 r21 = _mm256_mul_ps(rsq21,rinv21);
2739 r21 = _mm256_andnot_ps(dummy_mask,r21);
2741 /* Calculate table index by multiplying r with table scale and truncate to integer */
2742 rt = _mm256_mul_ps(r21,vftabscale);
2743 vfitab = _mm256_cvttps_epi32(rt);
2744 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2745 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2746 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2747 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2748 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2749 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2751 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2752 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2753 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2754 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2755 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2756 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2757 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2758 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2759 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2760 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2761 Heps = _mm256_mul_ps(vfeps,H);
2762 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2763 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2764 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2768 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2770 /* Calculate temporary vectorial force */
2771 tx = _mm256_mul_ps(fscal,dx21);
2772 ty = _mm256_mul_ps(fscal,dy21);
2773 tz = _mm256_mul_ps(fscal,dz21);
2775 /* Update vectorial force */
2776 fix2 = _mm256_add_ps(fix2,tx);
2777 fiy2 = _mm256_add_ps(fiy2,ty);
2778 fiz2 = _mm256_add_ps(fiz2,tz);
2780 fjx1 = _mm256_add_ps(fjx1,tx);
2781 fjy1 = _mm256_add_ps(fjy1,ty);
2782 fjz1 = _mm256_add_ps(fjz1,tz);
2784 /**************************
2785 * CALCULATE INTERACTIONS *
2786 **************************/
2788 r22 = _mm256_mul_ps(rsq22,rinv22);
2789 r22 = _mm256_andnot_ps(dummy_mask,r22);
2791 /* Calculate table index by multiplying r with table scale and truncate to integer */
2792 rt = _mm256_mul_ps(r22,vftabscale);
2793 vfitab = _mm256_cvttps_epi32(rt);
2794 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2795 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2796 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2797 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2798 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2799 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2801 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2802 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2803 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2804 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2805 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2806 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2807 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2808 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2809 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2810 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2811 Heps = _mm256_mul_ps(vfeps,H);
2812 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2813 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2814 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2818 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2820 /* Calculate temporary vectorial force */
2821 tx = _mm256_mul_ps(fscal,dx22);
2822 ty = _mm256_mul_ps(fscal,dy22);
2823 tz = _mm256_mul_ps(fscal,dz22);
2825 /* Update vectorial force */
2826 fix2 = _mm256_add_ps(fix2,tx);
2827 fiy2 = _mm256_add_ps(fiy2,ty);
2828 fiz2 = _mm256_add_ps(fiz2,tz);
2830 fjx2 = _mm256_add_ps(fjx2,tx);
2831 fjy2 = _mm256_add_ps(fjy2,ty);
2832 fjz2 = _mm256_add_ps(fjz2,tz);
2834 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2835 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2836 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2837 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2838 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2839 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2840 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2841 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2843 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2844 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2846 /* Inner loop uses 382 flops */
2849 /* End of innermost loop */
2851 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2852 f+i_coord_offset,fshift+i_shift_offset);
2854 /* Increment number of inner iterations */
2855 inneriter += j_index_end - j_index_start;
2857 /* Outer loop uses 18 flops */
2860 /* Increment number of outer iterations */
2863 /* Update outer/inner flops */
2865 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);