2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_VF_avx_256_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70 * jnr indices corresponding to data put in the eight positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrE,jnrF,jnrG,jnrH;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
85 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
93 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
110 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
114 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
116 __m128i vfitab_lo,vfitab_hi;
117 __m128i ifour = _mm_set1_epi32(4);
118 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
120 __m256 dummy_mask,cutoff_mask;
121 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
122 __m256 one = _mm256_set1_ps(1.0);
123 __m256 two = _mm256_set1_ps(2.0);
129 jindex = nlist->jindex;
131 shiftidx = nlist->shift;
133 shiftvec = fr->shift_vec[0];
134 fshift = fr->fshift[0];
135 facel = _mm256_set1_ps(fr->epsfac);
136 charge = mdatoms->chargeA;
137 nvdwtype = fr->ntype;
139 vdwtype = mdatoms->typeA;
141 vftab = kernel_data->table_elec_vdw->data;
142 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
144 /* Setup water-specific parameters */
145 inr = nlist->iinr[0];
146 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
147 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
148 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
149 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
151 jq0 = _mm256_set1_ps(charge[inr+0]);
152 jq1 = _mm256_set1_ps(charge[inr+1]);
153 jq2 = _mm256_set1_ps(charge[inr+2]);
154 vdwjidx0A = 2*vdwtype[inr+0];
155 qq00 = _mm256_mul_ps(iq0,jq0);
156 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
157 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
158 qq01 = _mm256_mul_ps(iq0,jq1);
159 qq02 = _mm256_mul_ps(iq0,jq2);
160 qq10 = _mm256_mul_ps(iq1,jq0);
161 qq11 = _mm256_mul_ps(iq1,jq1);
162 qq12 = _mm256_mul_ps(iq1,jq2);
163 qq20 = _mm256_mul_ps(iq2,jq0);
164 qq21 = _mm256_mul_ps(iq2,jq1);
165 qq22 = _mm256_mul_ps(iq2,jq2);
167 /* Avoid stupid compiler warnings */
168 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
181 for(iidx=0;iidx<4*DIM;iidx++)
186 /* Start outer loop over neighborlists */
187 for(iidx=0; iidx<nri; iidx++)
189 /* Load shift vector for this list */
190 i_shift_offset = DIM*shiftidx[iidx];
192 /* Load limits for loop over neighbors */
193 j_index_start = jindex[iidx];
194 j_index_end = jindex[iidx+1];
196 /* Get outer coordinate index */
198 i_coord_offset = DIM*inr;
200 /* Load i particle coords and add shift vector */
201 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
202 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
204 fix0 = _mm256_setzero_ps();
205 fiy0 = _mm256_setzero_ps();
206 fiz0 = _mm256_setzero_ps();
207 fix1 = _mm256_setzero_ps();
208 fiy1 = _mm256_setzero_ps();
209 fiz1 = _mm256_setzero_ps();
210 fix2 = _mm256_setzero_ps();
211 fiy2 = _mm256_setzero_ps();
212 fiz2 = _mm256_setzero_ps();
214 /* Reset potential sums */
215 velecsum = _mm256_setzero_ps();
216 vvdwsum = _mm256_setzero_ps();
218 /* Start inner kernel loop */
219 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
222 /* Get j neighbor index, and coordinate index */
231 j_coord_offsetA = DIM*jnrA;
232 j_coord_offsetB = DIM*jnrB;
233 j_coord_offsetC = DIM*jnrC;
234 j_coord_offsetD = DIM*jnrD;
235 j_coord_offsetE = DIM*jnrE;
236 j_coord_offsetF = DIM*jnrF;
237 j_coord_offsetG = DIM*jnrG;
238 j_coord_offsetH = DIM*jnrH;
240 /* load j atom coordinates */
241 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
242 x+j_coord_offsetC,x+j_coord_offsetD,
243 x+j_coord_offsetE,x+j_coord_offsetF,
244 x+j_coord_offsetG,x+j_coord_offsetH,
245 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
247 /* Calculate displacement vector */
248 dx00 = _mm256_sub_ps(ix0,jx0);
249 dy00 = _mm256_sub_ps(iy0,jy0);
250 dz00 = _mm256_sub_ps(iz0,jz0);
251 dx01 = _mm256_sub_ps(ix0,jx1);
252 dy01 = _mm256_sub_ps(iy0,jy1);
253 dz01 = _mm256_sub_ps(iz0,jz1);
254 dx02 = _mm256_sub_ps(ix0,jx2);
255 dy02 = _mm256_sub_ps(iy0,jy2);
256 dz02 = _mm256_sub_ps(iz0,jz2);
257 dx10 = _mm256_sub_ps(ix1,jx0);
258 dy10 = _mm256_sub_ps(iy1,jy0);
259 dz10 = _mm256_sub_ps(iz1,jz0);
260 dx11 = _mm256_sub_ps(ix1,jx1);
261 dy11 = _mm256_sub_ps(iy1,jy1);
262 dz11 = _mm256_sub_ps(iz1,jz1);
263 dx12 = _mm256_sub_ps(ix1,jx2);
264 dy12 = _mm256_sub_ps(iy1,jy2);
265 dz12 = _mm256_sub_ps(iz1,jz2);
266 dx20 = _mm256_sub_ps(ix2,jx0);
267 dy20 = _mm256_sub_ps(iy2,jy0);
268 dz20 = _mm256_sub_ps(iz2,jz0);
269 dx21 = _mm256_sub_ps(ix2,jx1);
270 dy21 = _mm256_sub_ps(iy2,jy1);
271 dz21 = _mm256_sub_ps(iz2,jz1);
272 dx22 = _mm256_sub_ps(ix2,jx2);
273 dy22 = _mm256_sub_ps(iy2,jy2);
274 dz22 = _mm256_sub_ps(iz2,jz2);
276 /* Calculate squared distance and things based on it */
277 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
278 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
279 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
280 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
281 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
282 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
283 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
284 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
285 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
287 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
288 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
289 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
290 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
291 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
292 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
293 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
294 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
295 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
297 fjx0 = _mm256_setzero_ps();
298 fjy0 = _mm256_setzero_ps();
299 fjz0 = _mm256_setzero_ps();
300 fjx1 = _mm256_setzero_ps();
301 fjy1 = _mm256_setzero_ps();
302 fjz1 = _mm256_setzero_ps();
303 fjx2 = _mm256_setzero_ps();
304 fjy2 = _mm256_setzero_ps();
305 fjz2 = _mm256_setzero_ps();
307 /**************************
308 * CALCULATE INTERACTIONS *
309 **************************/
311 r00 = _mm256_mul_ps(rsq00,rinv00);
313 /* Calculate table index by multiplying r with table scale and truncate to integer */
314 rt = _mm256_mul_ps(r00,vftabscale);
315 vfitab = _mm256_cvttps_epi32(rt);
316 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
317 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
318 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
319 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
320 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
321 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
323 /* CUBIC SPLINE TABLE ELECTROSTATICS */
324 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
325 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
326 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
327 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
328 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
329 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
330 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
331 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
332 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
333 Heps = _mm256_mul_ps(vfeps,H);
334 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
335 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
336 velec = _mm256_mul_ps(qq00,VV);
337 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
338 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
340 /* CUBIC SPLINE TABLE DISPERSION */
341 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
342 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
343 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
344 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
345 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
346 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
347 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
348 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
349 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
350 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
351 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
352 Heps = _mm256_mul_ps(vfeps,H);
353 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
354 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
355 vvdw6 = _mm256_mul_ps(c6_00,VV);
356 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
357 fvdw6 = _mm256_mul_ps(c6_00,FF);
359 /* CUBIC SPLINE TABLE REPULSION */
360 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
361 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
362 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
363 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
364 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
365 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
366 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
367 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
368 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
369 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
370 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
371 Heps = _mm256_mul_ps(vfeps,H);
372 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
373 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
374 vvdw12 = _mm256_mul_ps(c12_00,VV);
375 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
376 fvdw12 = _mm256_mul_ps(c12_00,FF);
377 vvdw = _mm256_add_ps(vvdw12,vvdw6);
378 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
380 /* Update potential sum for this i atom from the interaction with this j atom. */
381 velecsum = _mm256_add_ps(velecsum,velec);
382 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
384 fscal = _mm256_add_ps(felec,fvdw);
386 /* Calculate temporary vectorial force */
387 tx = _mm256_mul_ps(fscal,dx00);
388 ty = _mm256_mul_ps(fscal,dy00);
389 tz = _mm256_mul_ps(fscal,dz00);
391 /* Update vectorial force */
392 fix0 = _mm256_add_ps(fix0,tx);
393 fiy0 = _mm256_add_ps(fiy0,ty);
394 fiz0 = _mm256_add_ps(fiz0,tz);
396 fjx0 = _mm256_add_ps(fjx0,tx);
397 fjy0 = _mm256_add_ps(fjy0,ty);
398 fjz0 = _mm256_add_ps(fjz0,tz);
400 /**************************
401 * CALCULATE INTERACTIONS *
402 **************************/
404 r01 = _mm256_mul_ps(rsq01,rinv01);
406 /* Calculate table index by multiplying r with table scale and truncate to integer */
407 rt = _mm256_mul_ps(r01,vftabscale);
408 vfitab = _mm256_cvttps_epi32(rt);
409 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
410 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
411 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
412 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
413 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
414 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
416 /* CUBIC SPLINE TABLE ELECTROSTATICS */
417 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
418 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
419 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
420 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
421 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
422 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
423 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
424 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
425 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
426 Heps = _mm256_mul_ps(vfeps,H);
427 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
428 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
429 velec = _mm256_mul_ps(qq01,VV);
430 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
431 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
433 /* Update potential sum for this i atom from the interaction with this j atom. */
434 velecsum = _mm256_add_ps(velecsum,velec);
438 /* Calculate temporary vectorial force */
439 tx = _mm256_mul_ps(fscal,dx01);
440 ty = _mm256_mul_ps(fscal,dy01);
441 tz = _mm256_mul_ps(fscal,dz01);
443 /* Update vectorial force */
444 fix0 = _mm256_add_ps(fix0,tx);
445 fiy0 = _mm256_add_ps(fiy0,ty);
446 fiz0 = _mm256_add_ps(fiz0,tz);
448 fjx1 = _mm256_add_ps(fjx1,tx);
449 fjy1 = _mm256_add_ps(fjy1,ty);
450 fjz1 = _mm256_add_ps(fjz1,tz);
452 /**************************
453 * CALCULATE INTERACTIONS *
454 **************************/
456 r02 = _mm256_mul_ps(rsq02,rinv02);
458 /* Calculate table index by multiplying r with table scale and truncate to integer */
459 rt = _mm256_mul_ps(r02,vftabscale);
460 vfitab = _mm256_cvttps_epi32(rt);
461 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
462 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
463 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
464 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
465 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
466 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
468 /* CUBIC SPLINE TABLE ELECTROSTATICS */
469 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
470 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
471 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
472 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
473 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
474 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
475 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
476 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
477 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
478 Heps = _mm256_mul_ps(vfeps,H);
479 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
480 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
481 velec = _mm256_mul_ps(qq02,VV);
482 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
483 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
485 /* Update potential sum for this i atom from the interaction with this j atom. */
486 velecsum = _mm256_add_ps(velecsum,velec);
490 /* Calculate temporary vectorial force */
491 tx = _mm256_mul_ps(fscal,dx02);
492 ty = _mm256_mul_ps(fscal,dy02);
493 tz = _mm256_mul_ps(fscal,dz02);
495 /* Update vectorial force */
496 fix0 = _mm256_add_ps(fix0,tx);
497 fiy0 = _mm256_add_ps(fiy0,ty);
498 fiz0 = _mm256_add_ps(fiz0,tz);
500 fjx2 = _mm256_add_ps(fjx2,tx);
501 fjy2 = _mm256_add_ps(fjy2,ty);
502 fjz2 = _mm256_add_ps(fjz2,tz);
504 /**************************
505 * CALCULATE INTERACTIONS *
506 **************************/
508 r10 = _mm256_mul_ps(rsq10,rinv10);
510 /* Calculate table index by multiplying r with table scale and truncate to integer */
511 rt = _mm256_mul_ps(r10,vftabscale);
512 vfitab = _mm256_cvttps_epi32(rt);
513 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
514 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
515 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
516 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
517 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
518 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
520 /* CUBIC SPLINE TABLE ELECTROSTATICS */
521 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
522 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
523 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
524 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
525 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
526 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
527 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
528 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
529 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
530 Heps = _mm256_mul_ps(vfeps,H);
531 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
532 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
533 velec = _mm256_mul_ps(qq10,VV);
534 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
535 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
537 /* Update potential sum for this i atom from the interaction with this j atom. */
538 velecsum = _mm256_add_ps(velecsum,velec);
542 /* Calculate temporary vectorial force */
543 tx = _mm256_mul_ps(fscal,dx10);
544 ty = _mm256_mul_ps(fscal,dy10);
545 tz = _mm256_mul_ps(fscal,dz10);
547 /* Update vectorial force */
548 fix1 = _mm256_add_ps(fix1,tx);
549 fiy1 = _mm256_add_ps(fiy1,ty);
550 fiz1 = _mm256_add_ps(fiz1,tz);
552 fjx0 = _mm256_add_ps(fjx0,tx);
553 fjy0 = _mm256_add_ps(fjy0,ty);
554 fjz0 = _mm256_add_ps(fjz0,tz);
556 /**************************
557 * CALCULATE INTERACTIONS *
558 **************************/
560 r11 = _mm256_mul_ps(rsq11,rinv11);
562 /* Calculate table index by multiplying r with table scale and truncate to integer */
563 rt = _mm256_mul_ps(r11,vftabscale);
564 vfitab = _mm256_cvttps_epi32(rt);
565 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
566 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
567 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
568 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
569 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
570 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
572 /* CUBIC SPLINE TABLE ELECTROSTATICS */
573 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
574 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
575 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
576 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
577 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
578 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
579 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
580 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
581 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
582 Heps = _mm256_mul_ps(vfeps,H);
583 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
584 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
585 velec = _mm256_mul_ps(qq11,VV);
586 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
587 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
589 /* Update potential sum for this i atom from the interaction with this j atom. */
590 velecsum = _mm256_add_ps(velecsum,velec);
594 /* Calculate temporary vectorial force */
595 tx = _mm256_mul_ps(fscal,dx11);
596 ty = _mm256_mul_ps(fscal,dy11);
597 tz = _mm256_mul_ps(fscal,dz11);
599 /* Update vectorial force */
600 fix1 = _mm256_add_ps(fix1,tx);
601 fiy1 = _mm256_add_ps(fiy1,ty);
602 fiz1 = _mm256_add_ps(fiz1,tz);
604 fjx1 = _mm256_add_ps(fjx1,tx);
605 fjy1 = _mm256_add_ps(fjy1,ty);
606 fjz1 = _mm256_add_ps(fjz1,tz);
608 /**************************
609 * CALCULATE INTERACTIONS *
610 **************************/
612 r12 = _mm256_mul_ps(rsq12,rinv12);
614 /* Calculate table index by multiplying r with table scale and truncate to integer */
615 rt = _mm256_mul_ps(r12,vftabscale);
616 vfitab = _mm256_cvttps_epi32(rt);
617 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
618 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
619 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
620 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
621 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
622 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
624 /* CUBIC SPLINE TABLE ELECTROSTATICS */
625 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
626 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
627 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
628 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
629 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
630 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
631 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
632 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
633 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
634 Heps = _mm256_mul_ps(vfeps,H);
635 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
636 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
637 velec = _mm256_mul_ps(qq12,VV);
638 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
639 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
641 /* Update potential sum for this i atom from the interaction with this j atom. */
642 velecsum = _mm256_add_ps(velecsum,velec);
646 /* Calculate temporary vectorial force */
647 tx = _mm256_mul_ps(fscal,dx12);
648 ty = _mm256_mul_ps(fscal,dy12);
649 tz = _mm256_mul_ps(fscal,dz12);
651 /* Update vectorial force */
652 fix1 = _mm256_add_ps(fix1,tx);
653 fiy1 = _mm256_add_ps(fiy1,ty);
654 fiz1 = _mm256_add_ps(fiz1,tz);
656 fjx2 = _mm256_add_ps(fjx2,tx);
657 fjy2 = _mm256_add_ps(fjy2,ty);
658 fjz2 = _mm256_add_ps(fjz2,tz);
660 /**************************
661 * CALCULATE INTERACTIONS *
662 **************************/
664 r20 = _mm256_mul_ps(rsq20,rinv20);
666 /* Calculate table index by multiplying r with table scale and truncate to integer */
667 rt = _mm256_mul_ps(r20,vftabscale);
668 vfitab = _mm256_cvttps_epi32(rt);
669 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
670 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
671 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
672 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
673 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
674 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
676 /* CUBIC SPLINE TABLE ELECTROSTATICS */
677 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
678 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
679 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
680 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
681 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
682 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
683 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
684 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
685 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
686 Heps = _mm256_mul_ps(vfeps,H);
687 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
688 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
689 velec = _mm256_mul_ps(qq20,VV);
690 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
691 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velecsum = _mm256_add_ps(velecsum,velec);
698 /* Calculate temporary vectorial force */
699 tx = _mm256_mul_ps(fscal,dx20);
700 ty = _mm256_mul_ps(fscal,dy20);
701 tz = _mm256_mul_ps(fscal,dz20);
703 /* Update vectorial force */
704 fix2 = _mm256_add_ps(fix2,tx);
705 fiy2 = _mm256_add_ps(fiy2,ty);
706 fiz2 = _mm256_add_ps(fiz2,tz);
708 fjx0 = _mm256_add_ps(fjx0,tx);
709 fjy0 = _mm256_add_ps(fjy0,ty);
710 fjz0 = _mm256_add_ps(fjz0,tz);
712 /**************************
713 * CALCULATE INTERACTIONS *
714 **************************/
716 r21 = _mm256_mul_ps(rsq21,rinv21);
718 /* Calculate table index by multiplying r with table scale and truncate to integer */
719 rt = _mm256_mul_ps(r21,vftabscale);
720 vfitab = _mm256_cvttps_epi32(rt);
721 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
722 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
723 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
724 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
725 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
726 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
728 /* CUBIC SPLINE TABLE ELECTROSTATICS */
729 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
730 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
731 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
732 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
733 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
734 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
735 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
736 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
737 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
738 Heps = _mm256_mul_ps(vfeps,H);
739 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
740 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
741 velec = _mm256_mul_ps(qq21,VV);
742 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
743 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
745 /* Update potential sum for this i atom from the interaction with this j atom. */
746 velecsum = _mm256_add_ps(velecsum,velec);
750 /* Calculate temporary vectorial force */
751 tx = _mm256_mul_ps(fscal,dx21);
752 ty = _mm256_mul_ps(fscal,dy21);
753 tz = _mm256_mul_ps(fscal,dz21);
755 /* Update vectorial force */
756 fix2 = _mm256_add_ps(fix2,tx);
757 fiy2 = _mm256_add_ps(fiy2,ty);
758 fiz2 = _mm256_add_ps(fiz2,tz);
760 fjx1 = _mm256_add_ps(fjx1,tx);
761 fjy1 = _mm256_add_ps(fjy1,ty);
762 fjz1 = _mm256_add_ps(fjz1,tz);
764 /**************************
765 * CALCULATE INTERACTIONS *
766 **************************/
768 r22 = _mm256_mul_ps(rsq22,rinv22);
770 /* Calculate table index by multiplying r with table scale and truncate to integer */
771 rt = _mm256_mul_ps(r22,vftabscale);
772 vfitab = _mm256_cvttps_epi32(rt);
773 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
774 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
775 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
776 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
777 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
778 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
780 /* CUBIC SPLINE TABLE ELECTROSTATICS */
781 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
782 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
783 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
784 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
785 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
786 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
787 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
788 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
789 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
790 Heps = _mm256_mul_ps(vfeps,H);
791 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
792 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
793 velec = _mm256_mul_ps(qq22,VV);
794 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
795 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
797 /* Update potential sum for this i atom from the interaction with this j atom. */
798 velecsum = _mm256_add_ps(velecsum,velec);
802 /* Calculate temporary vectorial force */
803 tx = _mm256_mul_ps(fscal,dx22);
804 ty = _mm256_mul_ps(fscal,dy22);
805 tz = _mm256_mul_ps(fscal,dz22);
807 /* Update vectorial force */
808 fix2 = _mm256_add_ps(fix2,tx);
809 fiy2 = _mm256_add_ps(fiy2,ty);
810 fiz2 = _mm256_add_ps(fiz2,tz);
812 fjx2 = _mm256_add_ps(fjx2,tx);
813 fjy2 = _mm256_add_ps(fjy2,ty);
814 fjz2 = _mm256_add_ps(fjz2,tz);
816 fjptrA = f+j_coord_offsetA;
817 fjptrB = f+j_coord_offsetB;
818 fjptrC = f+j_coord_offsetC;
819 fjptrD = f+j_coord_offsetD;
820 fjptrE = f+j_coord_offsetE;
821 fjptrF = f+j_coord_offsetF;
822 fjptrG = f+j_coord_offsetG;
823 fjptrH = f+j_coord_offsetH;
825 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
826 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
828 /* Inner loop uses 417 flops */
834 /* Get j neighbor index, and coordinate index */
835 jnrlistA = jjnr[jidx];
836 jnrlistB = jjnr[jidx+1];
837 jnrlistC = jjnr[jidx+2];
838 jnrlistD = jjnr[jidx+3];
839 jnrlistE = jjnr[jidx+4];
840 jnrlistF = jjnr[jidx+5];
841 jnrlistG = jjnr[jidx+6];
842 jnrlistH = jjnr[jidx+7];
843 /* Sign of each element will be negative for non-real atoms.
844 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
845 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
847 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
848 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
850 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
851 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
852 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
853 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
854 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
855 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
856 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
857 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
858 j_coord_offsetA = DIM*jnrA;
859 j_coord_offsetB = DIM*jnrB;
860 j_coord_offsetC = DIM*jnrC;
861 j_coord_offsetD = DIM*jnrD;
862 j_coord_offsetE = DIM*jnrE;
863 j_coord_offsetF = DIM*jnrF;
864 j_coord_offsetG = DIM*jnrG;
865 j_coord_offsetH = DIM*jnrH;
867 /* load j atom coordinates */
868 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
869 x+j_coord_offsetC,x+j_coord_offsetD,
870 x+j_coord_offsetE,x+j_coord_offsetF,
871 x+j_coord_offsetG,x+j_coord_offsetH,
872 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
874 /* Calculate displacement vector */
875 dx00 = _mm256_sub_ps(ix0,jx0);
876 dy00 = _mm256_sub_ps(iy0,jy0);
877 dz00 = _mm256_sub_ps(iz0,jz0);
878 dx01 = _mm256_sub_ps(ix0,jx1);
879 dy01 = _mm256_sub_ps(iy0,jy1);
880 dz01 = _mm256_sub_ps(iz0,jz1);
881 dx02 = _mm256_sub_ps(ix0,jx2);
882 dy02 = _mm256_sub_ps(iy0,jy2);
883 dz02 = _mm256_sub_ps(iz0,jz2);
884 dx10 = _mm256_sub_ps(ix1,jx0);
885 dy10 = _mm256_sub_ps(iy1,jy0);
886 dz10 = _mm256_sub_ps(iz1,jz0);
887 dx11 = _mm256_sub_ps(ix1,jx1);
888 dy11 = _mm256_sub_ps(iy1,jy1);
889 dz11 = _mm256_sub_ps(iz1,jz1);
890 dx12 = _mm256_sub_ps(ix1,jx2);
891 dy12 = _mm256_sub_ps(iy1,jy2);
892 dz12 = _mm256_sub_ps(iz1,jz2);
893 dx20 = _mm256_sub_ps(ix2,jx0);
894 dy20 = _mm256_sub_ps(iy2,jy0);
895 dz20 = _mm256_sub_ps(iz2,jz0);
896 dx21 = _mm256_sub_ps(ix2,jx1);
897 dy21 = _mm256_sub_ps(iy2,jy1);
898 dz21 = _mm256_sub_ps(iz2,jz1);
899 dx22 = _mm256_sub_ps(ix2,jx2);
900 dy22 = _mm256_sub_ps(iy2,jy2);
901 dz22 = _mm256_sub_ps(iz2,jz2);
903 /* Calculate squared distance and things based on it */
904 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
905 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
906 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
907 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
908 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
909 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
910 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
911 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
912 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
914 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
915 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
916 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
917 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
918 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
919 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
920 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
921 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
922 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
924 fjx0 = _mm256_setzero_ps();
925 fjy0 = _mm256_setzero_ps();
926 fjz0 = _mm256_setzero_ps();
927 fjx1 = _mm256_setzero_ps();
928 fjy1 = _mm256_setzero_ps();
929 fjz1 = _mm256_setzero_ps();
930 fjx2 = _mm256_setzero_ps();
931 fjy2 = _mm256_setzero_ps();
932 fjz2 = _mm256_setzero_ps();
934 /**************************
935 * CALCULATE INTERACTIONS *
936 **************************/
938 r00 = _mm256_mul_ps(rsq00,rinv00);
939 r00 = _mm256_andnot_ps(dummy_mask,r00);
941 /* Calculate table index by multiplying r with table scale and truncate to integer */
942 rt = _mm256_mul_ps(r00,vftabscale);
943 vfitab = _mm256_cvttps_epi32(rt);
944 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
945 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
946 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
947 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
948 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
949 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
951 /* CUBIC SPLINE TABLE ELECTROSTATICS */
952 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
953 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
954 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
955 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
956 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
957 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
958 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
959 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
960 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
961 Heps = _mm256_mul_ps(vfeps,H);
962 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
963 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
964 velec = _mm256_mul_ps(qq00,VV);
965 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
966 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
968 /* CUBIC SPLINE TABLE DISPERSION */
969 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
970 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
971 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
972 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
973 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
974 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
975 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
976 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
977 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
978 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
979 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
980 Heps = _mm256_mul_ps(vfeps,H);
981 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
982 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
983 vvdw6 = _mm256_mul_ps(c6_00,VV);
984 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
985 fvdw6 = _mm256_mul_ps(c6_00,FF);
987 /* CUBIC SPLINE TABLE REPULSION */
988 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
989 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
990 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
991 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
992 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
993 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
994 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
995 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
996 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
997 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
998 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
999 Heps = _mm256_mul_ps(vfeps,H);
1000 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1001 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1002 vvdw12 = _mm256_mul_ps(c12_00,VV);
1003 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1004 fvdw12 = _mm256_mul_ps(c12_00,FF);
1005 vvdw = _mm256_add_ps(vvdw12,vvdw6);
1006 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1008 /* Update potential sum for this i atom from the interaction with this j atom. */
1009 velec = _mm256_andnot_ps(dummy_mask,velec);
1010 velecsum = _mm256_add_ps(velecsum,velec);
1011 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
1012 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
1014 fscal = _mm256_add_ps(felec,fvdw);
1016 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1018 /* Calculate temporary vectorial force */
1019 tx = _mm256_mul_ps(fscal,dx00);
1020 ty = _mm256_mul_ps(fscal,dy00);
1021 tz = _mm256_mul_ps(fscal,dz00);
1023 /* Update vectorial force */
1024 fix0 = _mm256_add_ps(fix0,tx);
1025 fiy0 = _mm256_add_ps(fiy0,ty);
1026 fiz0 = _mm256_add_ps(fiz0,tz);
1028 fjx0 = _mm256_add_ps(fjx0,tx);
1029 fjy0 = _mm256_add_ps(fjy0,ty);
1030 fjz0 = _mm256_add_ps(fjz0,tz);
1032 /**************************
1033 * CALCULATE INTERACTIONS *
1034 **************************/
1036 r01 = _mm256_mul_ps(rsq01,rinv01);
1037 r01 = _mm256_andnot_ps(dummy_mask,r01);
1039 /* Calculate table index by multiplying r with table scale and truncate to integer */
1040 rt = _mm256_mul_ps(r01,vftabscale);
1041 vfitab = _mm256_cvttps_epi32(rt);
1042 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1043 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1044 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1045 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1046 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1047 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1049 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1050 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1051 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1052 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1053 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1054 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1055 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1056 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1057 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1058 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1059 Heps = _mm256_mul_ps(vfeps,H);
1060 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1061 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1062 velec = _mm256_mul_ps(qq01,VV);
1063 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1064 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1066 /* Update potential sum for this i atom from the interaction with this j atom. */
1067 velec = _mm256_andnot_ps(dummy_mask,velec);
1068 velecsum = _mm256_add_ps(velecsum,velec);
1072 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1074 /* Calculate temporary vectorial force */
1075 tx = _mm256_mul_ps(fscal,dx01);
1076 ty = _mm256_mul_ps(fscal,dy01);
1077 tz = _mm256_mul_ps(fscal,dz01);
1079 /* Update vectorial force */
1080 fix0 = _mm256_add_ps(fix0,tx);
1081 fiy0 = _mm256_add_ps(fiy0,ty);
1082 fiz0 = _mm256_add_ps(fiz0,tz);
1084 fjx1 = _mm256_add_ps(fjx1,tx);
1085 fjy1 = _mm256_add_ps(fjy1,ty);
1086 fjz1 = _mm256_add_ps(fjz1,tz);
1088 /**************************
1089 * CALCULATE INTERACTIONS *
1090 **************************/
1092 r02 = _mm256_mul_ps(rsq02,rinv02);
1093 r02 = _mm256_andnot_ps(dummy_mask,r02);
1095 /* Calculate table index by multiplying r with table scale and truncate to integer */
1096 rt = _mm256_mul_ps(r02,vftabscale);
1097 vfitab = _mm256_cvttps_epi32(rt);
1098 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1099 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1100 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1101 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1102 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1103 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1105 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1106 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1107 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1108 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1109 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1110 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1111 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1112 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1113 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1114 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1115 Heps = _mm256_mul_ps(vfeps,H);
1116 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1117 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1118 velec = _mm256_mul_ps(qq02,VV);
1119 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1120 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1122 /* Update potential sum for this i atom from the interaction with this j atom. */
1123 velec = _mm256_andnot_ps(dummy_mask,velec);
1124 velecsum = _mm256_add_ps(velecsum,velec);
1128 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1130 /* Calculate temporary vectorial force */
1131 tx = _mm256_mul_ps(fscal,dx02);
1132 ty = _mm256_mul_ps(fscal,dy02);
1133 tz = _mm256_mul_ps(fscal,dz02);
1135 /* Update vectorial force */
1136 fix0 = _mm256_add_ps(fix0,tx);
1137 fiy0 = _mm256_add_ps(fiy0,ty);
1138 fiz0 = _mm256_add_ps(fiz0,tz);
1140 fjx2 = _mm256_add_ps(fjx2,tx);
1141 fjy2 = _mm256_add_ps(fjy2,ty);
1142 fjz2 = _mm256_add_ps(fjz2,tz);
1144 /**************************
1145 * CALCULATE INTERACTIONS *
1146 **************************/
1148 r10 = _mm256_mul_ps(rsq10,rinv10);
1149 r10 = _mm256_andnot_ps(dummy_mask,r10);
1151 /* Calculate table index by multiplying r with table scale and truncate to integer */
1152 rt = _mm256_mul_ps(r10,vftabscale);
1153 vfitab = _mm256_cvttps_epi32(rt);
1154 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1155 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1156 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1157 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1158 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1159 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1161 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1162 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1163 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1164 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1165 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1166 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1167 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1168 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1169 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1170 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1171 Heps = _mm256_mul_ps(vfeps,H);
1172 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1173 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1174 velec = _mm256_mul_ps(qq10,VV);
1175 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1176 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1178 /* Update potential sum for this i atom from the interaction with this j atom. */
1179 velec = _mm256_andnot_ps(dummy_mask,velec);
1180 velecsum = _mm256_add_ps(velecsum,velec);
1184 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1186 /* Calculate temporary vectorial force */
1187 tx = _mm256_mul_ps(fscal,dx10);
1188 ty = _mm256_mul_ps(fscal,dy10);
1189 tz = _mm256_mul_ps(fscal,dz10);
1191 /* Update vectorial force */
1192 fix1 = _mm256_add_ps(fix1,tx);
1193 fiy1 = _mm256_add_ps(fiy1,ty);
1194 fiz1 = _mm256_add_ps(fiz1,tz);
1196 fjx0 = _mm256_add_ps(fjx0,tx);
1197 fjy0 = _mm256_add_ps(fjy0,ty);
1198 fjz0 = _mm256_add_ps(fjz0,tz);
1200 /**************************
1201 * CALCULATE INTERACTIONS *
1202 **************************/
1204 r11 = _mm256_mul_ps(rsq11,rinv11);
1205 r11 = _mm256_andnot_ps(dummy_mask,r11);
1207 /* Calculate table index by multiplying r with table scale and truncate to integer */
1208 rt = _mm256_mul_ps(r11,vftabscale);
1209 vfitab = _mm256_cvttps_epi32(rt);
1210 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1211 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1212 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1213 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1214 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1215 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1217 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1218 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1219 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1220 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1221 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1222 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1223 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1224 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1225 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1226 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1227 Heps = _mm256_mul_ps(vfeps,H);
1228 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1229 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1230 velec = _mm256_mul_ps(qq11,VV);
1231 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1232 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1234 /* Update potential sum for this i atom from the interaction with this j atom. */
1235 velec = _mm256_andnot_ps(dummy_mask,velec);
1236 velecsum = _mm256_add_ps(velecsum,velec);
1240 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1242 /* Calculate temporary vectorial force */
1243 tx = _mm256_mul_ps(fscal,dx11);
1244 ty = _mm256_mul_ps(fscal,dy11);
1245 tz = _mm256_mul_ps(fscal,dz11);
1247 /* Update vectorial force */
1248 fix1 = _mm256_add_ps(fix1,tx);
1249 fiy1 = _mm256_add_ps(fiy1,ty);
1250 fiz1 = _mm256_add_ps(fiz1,tz);
1252 fjx1 = _mm256_add_ps(fjx1,tx);
1253 fjy1 = _mm256_add_ps(fjy1,ty);
1254 fjz1 = _mm256_add_ps(fjz1,tz);
1256 /**************************
1257 * CALCULATE INTERACTIONS *
1258 **************************/
1260 r12 = _mm256_mul_ps(rsq12,rinv12);
1261 r12 = _mm256_andnot_ps(dummy_mask,r12);
1263 /* Calculate table index by multiplying r with table scale and truncate to integer */
1264 rt = _mm256_mul_ps(r12,vftabscale);
1265 vfitab = _mm256_cvttps_epi32(rt);
1266 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1267 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1268 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1269 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1270 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1271 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1273 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1274 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1275 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1276 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1277 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1278 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1279 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1280 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1281 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1282 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1283 Heps = _mm256_mul_ps(vfeps,H);
1284 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1285 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1286 velec = _mm256_mul_ps(qq12,VV);
1287 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1288 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1290 /* Update potential sum for this i atom from the interaction with this j atom. */
1291 velec = _mm256_andnot_ps(dummy_mask,velec);
1292 velecsum = _mm256_add_ps(velecsum,velec);
1296 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1298 /* Calculate temporary vectorial force */
1299 tx = _mm256_mul_ps(fscal,dx12);
1300 ty = _mm256_mul_ps(fscal,dy12);
1301 tz = _mm256_mul_ps(fscal,dz12);
1303 /* Update vectorial force */
1304 fix1 = _mm256_add_ps(fix1,tx);
1305 fiy1 = _mm256_add_ps(fiy1,ty);
1306 fiz1 = _mm256_add_ps(fiz1,tz);
1308 fjx2 = _mm256_add_ps(fjx2,tx);
1309 fjy2 = _mm256_add_ps(fjy2,ty);
1310 fjz2 = _mm256_add_ps(fjz2,tz);
1312 /**************************
1313 * CALCULATE INTERACTIONS *
1314 **************************/
1316 r20 = _mm256_mul_ps(rsq20,rinv20);
1317 r20 = _mm256_andnot_ps(dummy_mask,r20);
1319 /* Calculate table index by multiplying r with table scale and truncate to integer */
1320 rt = _mm256_mul_ps(r20,vftabscale);
1321 vfitab = _mm256_cvttps_epi32(rt);
1322 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1323 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1324 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1325 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1326 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1327 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1329 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1330 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1331 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1332 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1333 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1334 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1335 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1336 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1337 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1338 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1339 Heps = _mm256_mul_ps(vfeps,H);
1340 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1341 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1342 velec = _mm256_mul_ps(qq20,VV);
1343 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1344 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1346 /* Update potential sum for this i atom from the interaction with this j atom. */
1347 velec = _mm256_andnot_ps(dummy_mask,velec);
1348 velecsum = _mm256_add_ps(velecsum,velec);
1352 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1354 /* Calculate temporary vectorial force */
1355 tx = _mm256_mul_ps(fscal,dx20);
1356 ty = _mm256_mul_ps(fscal,dy20);
1357 tz = _mm256_mul_ps(fscal,dz20);
1359 /* Update vectorial force */
1360 fix2 = _mm256_add_ps(fix2,tx);
1361 fiy2 = _mm256_add_ps(fiy2,ty);
1362 fiz2 = _mm256_add_ps(fiz2,tz);
1364 fjx0 = _mm256_add_ps(fjx0,tx);
1365 fjy0 = _mm256_add_ps(fjy0,ty);
1366 fjz0 = _mm256_add_ps(fjz0,tz);
1368 /**************************
1369 * CALCULATE INTERACTIONS *
1370 **************************/
1372 r21 = _mm256_mul_ps(rsq21,rinv21);
1373 r21 = _mm256_andnot_ps(dummy_mask,r21);
1375 /* Calculate table index by multiplying r with table scale and truncate to integer */
1376 rt = _mm256_mul_ps(r21,vftabscale);
1377 vfitab = _mm256_cvttps_epi32(rt);
1378 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1379 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1380 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1381 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1382 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1383 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1385 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1386 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1387 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1388 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1389 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1390 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1391 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1392 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1393 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1394 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1395 Heps = _mm256_mul_ps(vfeps,H);
1396 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1397 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1398 velec = _mm256_mul_ps(qq21,VV);
1399 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1400 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1402 /* Update potential sum for this i atom from the interaction with this j atom. */
1403 velec = _mm256_andnot_ps(dummy_mask,velec);
1404 velecsum = _mm256_add_ps(velecsum,velec);
1408 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1410 /* Calculate temporary vectorial force */
1411 tx = _mm256_mul_ps(fscal,dx21);
1412 ty = _mm256_mul_ps(fscal,dy21);
1413 tz = _mm256_mul_ps(fscal,dz21);
1415 /* Update vectorial force */
1416 fix2 = _mm256_add_ps(fix2,tx);
1417 fiy2 = _mm256_add_ps(fiy2,ty);
1418 fiz2 = _mm256_add_ps(fiz2,tz);
1420 fjx1 = _mm256_add_ps(fjx1,tx);
1421 fjy1 = _mm256_add_ps(fjy1,ty);
1422 fjz1 = _mm256_add_ps(fjz1,tz);
1424 /**************************
1425 * CALCULATE INTERACTIONS *
1426 **************************/
1428 r22 = _mm256_mul_ps(rsq22,rinv22);
1429 r22 = _mm256_andnot_ps(dummy_mask,r22);
1431 /* Calculate table index by multiplying r with table scale and truncate to integer */
1432 rt = _mm256_mul_ps(r22,vftabscale);
1433 vfitab = _mm256_cvttps_epi32(rt);
1434 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1435 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1436 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1437 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1438 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1439 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1441 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1442 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1443 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1444 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1445 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1446 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1447 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1448 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1449 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1450 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1451 Heps = _mm256_mul_ps(vfeps,H);
1452 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1453 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1454 velec = _mm256_mul_ps(qq22,VV);
1455 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1456 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1458 /* Update potential sum for this i atom from the interaction with this j atom. */
1459 velec = _mm256_andnot_ps(dummy_mask,velec);
1460 velecsum = _mm256_add_ps(velecsum,velec);
1464 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1466 /* Calculate temporary vectorial force */
1467 tx = _mm256_mul_ps(fscal,dx22);
1468 ty = _mm256_mul_ps(fscal,dy22);
1469 tz = _mm256_mul_ps(fscal,dz22);
1471 /* Update vectorial force */
1472 fix2 = _mm256_add_ps(fix2,tx);
1473 fiy2 = _mm256_add_ps(fiy2,ty);
1474 fiz2 = _mm256_add_ps(fiz2,tz);
1476 fjx2 = _mm256_add_ps(fjx2,tx);
1477 fjy2 = _mm256_add_ps(fjy2,ty);
1478 fjz2 = _mm256_add_ps(fjz2,tz);
1480 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1481 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1482 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1483 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1484 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1485 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1486 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1487 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1489 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1490 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1492 /* Inner loop uses 426 flops */
1495 /* End of innermost loop */
1497 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1498 f+i_coord_offset,fshift+i_shift_offset);
1501 /* Update potential energies */
1502 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1503 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1505 /* Increment number of inner iterations */
1506 inneriter += j_index_end - j_index_start;
1508 /* Outer loop uses 20 flops */
1511 /* Increment number of outer iterations */
1514 /* Update outer/inner flops */
1516 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*426);
1519 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1520 * Electrostatics interaction: CubicSplineTable
1521 * VdW interaction: CubicSplineTable
1522 * Geometry: Water3-Water3
1523 * Calculate force/pot: Force
1526 nb_kernel_ElecCSTab_VdwCSTab_GeomW3W3_F_avx_256_single
1527 (t_nblist * gmx_restrict nlist,
1528 rvec * gmx_restrict xx,
1529 rvec * gmx_restrict ff,
1530 t_forcerec * gmx_restrict fr,
1531 t_mdatoms * gmx_restrict mdatoms,
1532 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1533 t_nrnb * gmx_restrict nrnb)
1535 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1536 * just 0 for non-waters.
1537 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1538 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1540 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1541 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1542 int jnrA,jnrB,jnrC,jnrD;
1543 int jnrE,jnrF,jnrG,jnrH;
1544 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1545 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1546 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1547 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1548 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1549 real rcutoff_scalar;
1550 real *shiftvec,*fshift,*x,*f;
1551 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1552 real scratch[4*DIM];
1553 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1554 real * vdwioffsetptr0;
1555 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1556 real * vdwioffsetptr1;
1557 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1558 real * vdwioffsetptr2;
1559 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1560 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1561 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1562 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1563 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1564 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1565 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1566 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1567 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1568 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1569 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1570 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1571 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1572 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1573 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1574 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1575 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1578 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1581 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1582 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1584 __m128i vfitab_lo,vfitab_hi;
1585 __m128i ifour = _mm_set1_epi32(4);
1586 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1588 __m256 dummy_mask,cutoff_mask;
1589 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1590 __m256 one = _mm256_set1_ps(1.0);
1591 __m256 two = _mm256_set1_ps(2.0);
1597 jindex = nlist->jindex;
1599 shiftidx = nlist->shift;
1601 shiftvec = fr->shift_vec[0];
1602 fshift = fr->fshift[0];
1603 facel = _mm256_set1_ps(fr->epsfac);
1604 charge = mdatoms->chargeA;
1605 nvdwtype = fr->ntype;
1606 vdwparam = fr->nbfp;
1607 vdwtype = mdatoms->typeA;
1609 vftab = kernel_data->table_elec_vdw->data;
1610 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
1612 /* Setup water-specific parameters */
1613 inr = nlist->iinr[0];
1614 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1615 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1616 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1617 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1619 jq0 = _mm256_set1_ps(charge[inr+0]);
1620 jq1 = _mm256_set1_ps(charge[inr+1]);
1621 jq2 = _mm256_set1_ps(charge[inr+2]);
1622 vdwjidx0A = 2*vdwtype[inr+0];
1623 qq00 = _mm256_mul_ps(iq0,jq0);
1624 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1625 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1626 qq01 = _mm256_mul_ps(iq0,jq1);
1627 qq02 = _mm256_mul_ps(iq0,jq2);
1628 qq10 = _mm256_mul_ps(iq1,jq0);
1629 qq11 = _mm256_mul_ps(iq1,jq1);
1630 qq12 = _mm256_mul_ps(iq1,jq2);
1631 qq20 = _mm256_mul_ps(iq2,jq0);
1632 qq21 = _mm256_mul_ps(iq2,jq1);
1633 qq22 = _mm256_mul_ps(iq2,jq2);
1635 /* Avoid stupid compiler warnings */
1636 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1637 j_coord_offsetA = 0;
1638 j_coord_offsetB = 0;
1639 j_coord_offsetC = 0;
1640 j_coord_offsetD = 0;
1641 j_coord_offsetE = 0;
1642 j_coord_offsetF = 0;
1643 j_coord_offsetG = 0;
1644 j_coord_offsetH = 0;
1649 for(iidx=0;iidx<4*DIM;iidx++)
1651 scratch[iidx] = 0.0;
1654 /* Start outer loop over neighborlists */
1655 for(iidx=0; iidx<nri; iidx++)
1657 /* Load shift vector for this list */
1658 i_shift_offset = DIM*shiftidx[iidx];
1660 /* Load limits for loop over neighbors */
1661 j_index_start = jindex[iidx];
1662 j_index_end = jindex[iidx+1];
1664 /* Get outer coordinate index */
1666 i_coord_offset = DIM*inr;
1668 /* Load i particle coords and add shift vector */
1669 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1670 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1672 fix0 = _mm256_setzero_ps();
1673 fiy0 = _mm256_setzero_ps();
1674 fiz0 = _mm256_setzero_ps();
1675 fix1 = _mm256_setzero_ps();
1676 fiy1 = _mm256_setzero_ps();
1677 fiz1 = _mm256_setzero_ps();
1678 fix2 = _mm256_setzero_ps();
1679 fiy2 = _mm256_setzero_ps();
1680 fiz2 = _mm256_setzero_ps();
1682 /* Start inner kernel loop */
1683 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1686 /* Get j neighbor index, and coordinate index */
1688 jnrB = jjnr[jidx+1];
1689 jnrC = jjnr[jidx+2];
1690 jnrD = jjnr[jidx+3];
1691 jnrE = jjnr[jidx+4];
1692 jnrF = jjnr[jidx+5];
1693 jnrG = jjnr[jidx+6];
1694 jnrH = jjnr[jidx+7];
1695 j_coord_offsetA = DIM*jnrA;
1696 j_coord_offsetB = DIM*jnrB;
1697 j_coord_offsetC = DIM*jnrC;
1698 j_coord_offsetD = DIM*jnrD;
1699 j_coord_offsetE = DIM*jnrE;
1700 j_coord_offsetF = DIM*jnrF;
1701 j_coord_offsetG = DIM*jnrG;
1702 j_coord_offsetH = DIM*jnrH;
1704 /* load j atom coordinates */
1705 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1706 x+j_coord_offsetC,x+j_coord_offsetD,
1707 x+j_coord_offsetE,x+j_coord_offsetF,
1708 x+j_coord_offsetG,x+j_coord_offsetH,
1709 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1711 /* Calculate displacement vector */
1712 dx00 = _mm256_sub_ps(ix0,jx0);
1713 dy00 = _mm256_sub_ps(iy0,jy0);
1714 dz00 = _mm256_sub_ps(iz0,jz0);
1715 dx01 = _mm256_sub_ps(ix0,jx1);
1716 dy01 = _mm256_sub_ps(iy0,jy1);
1717 dz01 = _mm256_sub_ps(iz0,jz1);
1718 dx02 = _mm256_sub_ps(ix0,jx2);
1719 dy02 = _mm256_sub_ps(iy0,jy2);
1720 dz02 = _mm256_sub_ps(iz0,jz2);
1721 dx10 = _mm256_sub_ps(ix1,jx0);
1722 dy10 = _mm256_sub_ps(iy1,jy0);
1723 dz10 = _mm256_sub_ps(iz1,jz0);
1724 dx11 = _mm256_sub_ps(ix1,jx1);
1725 dy11 = _mm256_sub_ps(iy1,jy1);
1726 dz11 = _mm256_sub_ps(iz1,jz1);
1727 dx12 = _mm256_sub_ps(ix1,jx2);
1728 dy12 = _mm256_sub_ps(iy1,jy2);
1729 dz12 = _mm256_sub_ps(iz1,jz2);
1730 dx20 = _mm256_sub_ps(ix2,jx0);
1731 dy20 = _mm256_sub_ps(iy2,jy0);
1732 dz20 = _mm256_sub_ps(iz2,jz0);
1733 dx21 = _mm256_sub_ps(ix2,jx1);
1734 dy21 = _mm256_sub_ps(iy2,jy1);
1735 dz21 = _mm256_sub_ps(iz2,jz1);
1736 dx22 = _mm256_sub_ps(ix2,jx2);
1737 dy22 = _mm256_sub_ps(iy2,jy2);
1738 dz22 = _mm256_sub_ps(iz2,jz2);
1740 /* Calculate squared distance and things based on it */
1741 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1742 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1743 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1744 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1745 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1746 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1747 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1748 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1749 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1751 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1752 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1753 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1754 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1755 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1756 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1757 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1758 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1759 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1761 fjx0 = _mm256_setzero_ps();
1762 fjy0 = _mm256_setzero_ps();
1763 fjz0 = _mm256_setzero_ps();
1764 fjx1 = _mm256_setzero_ps();
1765 fjy1 = _mm256_setzero_ps();
1766 fjz1 = _mm256_setzero_ps();
1767 fjx2 = _mm256_setzero_ps();
1768 fjy2 = _mm256_setzero_ps();
1769 fjz2 = _mm256_setzero_ps();
1771 /**************************
1772 * CALCULATE INTERACTIONS *
1773 **************************/
1775 r00 = _mm256_mul_ps(rsq00,rinv00);
1777 /* Calculate table index by multiplying r with table scale and truncate to integer */
1778 rt = _mm256_mul_ps(r00,vftabscale);
1779 vfitab = _mm256_cvttps_epi32(rt);
1780 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1781 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1782 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1783 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1784 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1785 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1787 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1788 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1789 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1790 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1791 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1792 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1793 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1794 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1795 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1796 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1797 Heps = _mm256_mul_ps(vfeps,H);
1798 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1799 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1800 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1802 /* CUBIC SPLINE TABLE DISPERSION */
1803 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1804 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1805 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1806 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1807 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1808 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1809 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1810 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1811 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1812 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1813 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1814 Heps = _mm256_mul_ps(vfeps,H);
1815 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1816 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1817 fvdw6 = _mm256_mul_ps(c6_00,FF);
1819 /* CUBIC SPLINE TABLE REPULSION */
1820 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1821 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1822 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1823 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1824 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1825 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1826 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1827 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1828 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1829 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1830 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1831 Heps = _mm256_mul_ps(vfeps,H);
1832 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1833 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1834 fvdw12 = _mm256_mul_ps(c12_00,FF);
1835 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1837 fscal = _mm256_add_ps(felec,fvdw);
1839 /* Calculate temporary vectorial force */
1840 tx = _mm256_mul_ps(fscal,dx00);
1841 ty = _mm256_mul_ps(fscal,dy00);
1842 tz = _mm256_mul_ps(fscal,dz00);
1844 /* Update vectorial force */
1845 fix0 = _mm256_add_ps(fix0,tx);
1846 fiy0 = _mm256_add_ps(fiy0,ty);
1847 fiz0 = _mm256_add_ps(fiz0,tz);
1849 fjx0 = _mm256_add_ps(fjx0,tx);
1850 fjy0 = _mm256_add_ps(fjy0,ty);
1851 fjz0 = _mm256_add_ps(fjz0,tz);
1853 /**************************
1854 * CALCULATE INTERACTIONS *
1855 **************************/
1857 r01 = _mm256_mul_ps(rsq01,rinv01);
1859 /* Calculate table index by multiplying r with table scale and truncate to integer */
1860 rt = _mm256_mul_ps(r01,vftabscale);
1861 vfitab = _mm256_cvttps_epi32(rt);
1862 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1863 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1864 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1865 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1866 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1867 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1869 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1870 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1871 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1872 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1873 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1874 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1875 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1876 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1877 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1878 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1879 Heps = _mm256_mul_ps(vfeps,H);
1880 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1881 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1882 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1886 /* Calculate temporary vectorial force */
1887 tx = _mm256_mul_ps(fscal,dx01);
1888 ty = _mm256_mul_ps(fscal,dy01);
1889 tz = _mm256_mul_ps(fscal,dz01);
1891 /* Update vectorial force */
1892 fix0 = _mm256_add_ps(fix0,tx);
1893 fiy0 = _mm256_add_ps(fiy0,ty);
1894 fiz0 = _mm256_add_ps(fiz0,tz);
1896 fjx1 = _mm256_add_ps(fjx1,tx);
1897 fjy1 = _mm256_add_ps(fjy1,ty);
1898 fjz1 = _mm256_add_ps(fjz1,tz);
1900 /**************************
1901 * CALCULATE INTERACTIONS *
1902 **************************/
1904 r02 = _mm256_mul_ps(rsq02,rinv02);
1906 /* Calculate table index by multiplying r with table scale and truncate to integer */
1907 rt = _mm256_mul_ps(r02,vftabscale);
1908 vfitab = _mm256_cvttps_epi32(rt);
1909 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1910 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1911 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1912 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1913 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1914 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1916 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1917 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1918 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1919 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1920 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1921 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1922 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1923 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1924 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1925 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1926 Heps = _mm256_mul_ps(vfeps,H);
1927 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1928 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1929 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1933 /* Calculate temporary vectorial force */
1934 tx = _mm256_mul_ps(fscal,dx02);
1935 ty = _mm256_mul_ps(fscal,dy02);
1936 tz = _mm256_mul_ps(fscal,dz02);
1938 /* Update vectorial force */
1939 fix0 = _mm256_add_ps(fix0,tx);
1940 fiy0 = _mm256_add_ps(fiy0,ty);
1941 fiz0 = _mm256_add_ps(fiz0,tz);
1943 fjx2 = _mm256_add_ps(fjx2,tx);
1944 fjy2 = _mm256_add_ps(fjy2,ty);
1945 fjz2 = _mm256_add_ps(fjz2,tz);
1947 /**************************
1948 * CALCULATE INTERACTIONS *
1949 **************************/
1951 r10 = _mm256_mul_ps(rsq10,rinv10);
1953 /* Calculate table index by multiplying r with table scale and truncate to integer */
1954 rt = _mm256_mul_ps(r10,vftabscale);
1955 vfitab = _mm256_cvttps_epi32(rt);
1956 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1957 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1958 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1959 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1960 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1961 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1963 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1964 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1965 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1966 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1967 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1968 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1969 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1970 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1971 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1972 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1973 Heps = _mm256_mul_ps(vfeps,H);
1974 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1975 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1976 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1980 /* Calculate temporary vectorial force */
1981 tx = _mm256_mul_ps(fscal,dx10);
1982 ty = _mm256_mul_ps(fscal,dy10);
1983 tz = _mm256_mul_ps(fscal,dz10);
1985 /* Update vectorial force */
1986 fix1 = _mm256_add_ps(fix1,tx);
1987 fiy1 = _mm256_add_ps(fiy1,ty);
1988 fiz1 = _mm256_add_ps(fiz1,tz);
1990 fjx0 = _mm256_add_ps(fjx0,tx);
1991 fjy0 = _mm256_add_ps(fjy0,ty);
1992 fjz0 = _mm256_add_ps(fjz0,tz);
1994 /**************************
1995 * CALCULATE INTERACTIONS *
1996 **************************/
1998 r11 = _mm256_mul_ps(rsq11,rinv11);
2000 /* Calculate table index by multiplying r with table scale and truncate to integer */
2001 rt = _mm256_mul_ps(r11,vftabscale);
2002 vfitab = _mm256_cvttps_epi32(rt);
2003 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2004 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2005 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2006 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2007 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2008 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2010 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2011 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2012 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2013 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2014 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2015 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2016 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2017 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2018 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2019 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2020 Heps = _mm256_mul_ps(vfeps,H);
2021 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2022 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2023 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2027 /* Calculate temporary vectorial force */
2028 tx = _mm256_mul_ps(fscal,dx11);
2029 ty = _mm256_mul_ps(fscal,dy11);
2030 tz = _mm256_mul_ps(fscal,dz11);
2032 /* Update vectorial force */
2033 fix1 = _mm256_add_ps(fix1,tx);
2034 fiy1 = _mm256_add_ps(fiy1,ty);
2035 fiz1 = _mm256_add_ps(fiz1,tz);
2037 fjx1 = _mm256_add_ps(fjx1,tx);
2038 fjy1 = _mm256_add_ps(fjy1,ty);
2039 fjz1 = _mm256_add_ps(fjz1,tz);
2041 /**************************
2042 * CALCULATE INTERACTIONS *
2043 **************************/
2045 r12 = _mm256_mul_ps(rsq12,rinv12);
2047 /* Calculate table index by multiplying r with table scale and truncate to integer */
2048 rt = _mm256_mul_ps(r12,vftabscale);
2049 vfitab = _mm256_cvttps_epi32(rt);
2050 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2051 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2052 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2053 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2054 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2055 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2057 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2058 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2059 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2060 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2061 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2062 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2063 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2064 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2065 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2066 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2067 Heps = _mm256_mul_ps(vfeps,H);
2068 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2069 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2070 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2074 /* Calculate temporary vectorial force */
2075 tx = _mm256_mul_ps(fscal,dx12);
2076 ty = _mm256_mul_ps(fscal,dy12);
2077 tz = _mm256_mul_ps(fscal,dz12);
2079 /* Update vectorial force */
2080 fix1 = _mm256_add_ps(fix1,tx);
2081 fiy1 = _mm256_add_ps(fiy1,ty);
2082 fiz1 = _mm256_add_ps(fiz1,tz);
2084 fjx2 = _mm256_add_ps(fjx2,tx);
2085 fjy2 = _mm256_add_ps(fjy2,ty);
2086 fjz2 = _mm256_add_ps(fjz2,tz);
2088 /**************************
2089 * CALCULATE INTERACTIONS *
2090 **************************/
2092 r20 = _mm256_mul_ps(rsq20,rinv20);
2094 /* Calculate table index by multiplying r with table scale and truncate to integer */
2095 rt = _mm256_mul_ps(r20,vftabscale);
2096 vfitab = _mm256_cvttps_epi32(rt);
2097 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2098 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2099 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2100 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2101 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2102 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2104 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2105 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2106 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2107 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2108 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2109 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2110 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2111 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2112 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2113 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2114 Heps = _mm256_mul_ps(vfeps,H);
2115 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2116 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2117 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2121 /* Calculate temporary vectorial force */
2122 tx = _mm256_mul_ps(fscal,dx20);
2123 ty = _mm256_mul_ps(fscal,dy20);
2124 tz = _mm256_mul_ps(fscal,dz20);
2126 /* Update vectorial force */
2127 fix2 = _mm256_add_ps(fix2,tx);
2128 fiy2 = _mm256_add_ps(fiy2,ty);
2129 fiz2 = _mm256_add_ps(fiz2,tz);
2131 fjx0 = _mm256_add_ps(fjx0,tx);
2132 fjy0 = _mm256_add_ps(fjy0,ty);
2133 fjz0 = _mm256_add_ps(fjz0,tz);
2135 /**************************
2136 * CALCULATE INTERACTIONS *
2137 **************************/
2139 r21 = _mm256_mul_ps(rsq21,rinv21);
2141 /* Calculate table index by multiplying r with table scale and truncate to integer */
2142 rt = _mm256_mul_ps(r21,vftabscale);
2143 vfitab = _mm256_cvttps_epi32(rt);
2144 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2145 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2146 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2147 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2148 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2149 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2151 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2152 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2153 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2154 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2155 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2156 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2157 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2158 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2159 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2160 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2161 Heps = _mm256_mul_ps(vfeps,H);
2162 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2163 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2164 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2168 /* Calculate temporary vectorial force */
2169 tx = _mm256_mul_ps(fscal,dx21);
2170 ty = _mm256_mul_ps(fscal,dy21);
2171 tz = _mm256_mul_ps(fscal,dz21);
2173 /* Update vectorial force */
2174 fix2 = _mm256_add_ps(fix2,tx);
2175 fiy2 = _mm256_add_ps(fiy2,ty);
2176 fiz2 = _mm256_add_ps(fiz2,tz);
2178 fjx1 = _mm256_add_ps(fjx1,tx);
2179 fjy1 = _mm256_add_ps(fjy1,ty);
2180 fjz1 = _mm256_add_ps(fjz1,tz);
2182 /**************************
2183 * CALCULATE INTERACTIONS *
2184 **************************/
2186 r22 = _mm256_mul_ps(rsq22,rinv22);
2188 /* Calculate table index by multiplying r with table scale and truncate to integer */
2189 rt = _mm256_mul_ps(r22,vftabscale);
2190 vfitab = _mm256_cvttps_epi32(rt);
2191 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2192 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2193 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2194 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2195 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2196 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2198 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2199 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2200 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2201 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2202 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2203 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2204 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2205 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2206 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2207 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2208 Heps = _mm256_mul_ps(vfeps,H);
2209 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2210 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2211 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2215 /* Calculate temporary vectorial force */
2216 tx = _mm256_mul_ps(fscal,dx22);
2217 ty = _mm256_mul_ps(fscal,dy22);
2218 tz = _mm256_mul_ps(fscal,dz22);
2220 /* Update vectorial force */
2221 fix2 = _mm256_add_ps(fix2,tx);
2222 fiy2 = _mm256_add_ps(fiy2,ty);
2223 fiz2 = _mm256_add_ps(fiz2,tz);
2225 fjx2 = _mm256_add_ps(fjx2,tx);
2226 fjy2 = _mm256_add_ps(fjy2,ty);
2227 fjz2 = _mm256_add_ps(fjz2,tz);
2229 fjptrA = f+j_coord_offsetA;
2230 fjptrB = f+j_coord_offsetB;
2231 fjptrC = f+j_coord_offsetC;
2232 fjptrD = f+j_coord_offsetD;
2233 fjptrE = f+j_coord_offsetE;
2234 fjptrF = f+j_coord_offsetF;
2235 fjptrG = f+j_coord_offsetG;
2236 fjptrH = f+j_coord_offsetH;
2238 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2239 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2241 /* Inner loop uses 373 flops */
2244 if(jidx<j_index_end)
2247 /* Get j neighbor index, and coordinate index */
2248 jnrlistA = jjnr[jidx];
2249 jnrlistB = jjnr[jidx+1];
2250 jnrlistC = jjnr[jidx+2];
2251 jnrlistD = jjnr[jidx+3];
2252 jnrlistE = jjnr[jidx+4];
2253 jnrlistF = jjnr[jidx+5];
2254 jnrlistG = jjnr[jidx+6];
2255 jnrlistH = jjnr[jidx+7];
2256 /* Sign of each element will be negative for non-real atoms.
2257 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2258 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2260 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2261 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2263 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2264 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2265 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2266 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2267 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2268 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2269 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2270 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2271 j_coord_offsetA = DIM*jnrA;
2272 j_coord_offsetB = DIM*jnrB;
2273 j_coord_offsetC = DIM*jnrC;
2274 j_coord_offsetD = DIM*jnrD;
2275 j_coord_offsetE = DIM*jnrE;
2276 j_coord_offsetF = DIM*jnrF;
2277 j_coord_offsetG = DIM*jnrG;
2278 j_coord_offsetH = DIM*jnrH;
2280 /* load j atom coordinates */
2281 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2282 x+j_coord_offsetC,x+j_coord_offsetD,
2283 x+j_coord_offsetE,x+j_coord_offsetF,
2284 x+j_coord_offsetG,x+j_coord_offsetH,
2285 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2287 /* Calculate displacement vector */
2288 dx00 = _mm256_sub_ps(ix0,jx0);
2289 dy00 = _mm256_sub_ps(iy0,jy0);
2290 dz00 = _mm256_sub_ps(iz0,jz0);
2291 dx01 = _mm256_sub_ps(ix0,jx1);
2292 dy01 = _mm256_sub_ps(iy0,jy1);
2293 dz01 = _mm256_sub_ps(iz0,jz1);
2294 dx02 = _mm256_sub_ps(ix0,jx2);
2295 dy02 = _mm256_sub_ps(iy0,jy2);
2296 dz02 = _mm256_sub_ps(iz0,jz2);
2297 dx10 = _mm256_sub_ps(ix1,jx0);
2298 dy10 = _mm256_sub_ps(iy1,jy0);
2299 dz10 = _mm256_sub_ps(iz1,jz0);
2300 dx11 = _mm256_sub_ps(ix1,jx1);
2301 dy11 = _mm256_sub_ps(iy1,jy1);
2302 dz11 = _mm256_sub_ps(iz1,jz1);
2303 dx12 = _mm256_sub_ps(ix1,jx2);
2304 dy12 = _mm256_sub_ps(iy1,jy2);
2305 dz12 = _mm256_sub_ps(iz1,jz2);
2306 dx20 = _mm256_sub_ps(ix2,jx0);
2307 dy20 = _mm256_sub_ps(iy2,jy0);
2308 dz20 = _mm256_sub_ps(iz2,jz0);
2309 dx21 = _mm256_sub_ps(ix2,jx1);
2310 dy21 = _mm256_sub_ps(iy2,jy1);
2311 dz21 = _mm256_sub_ps(iz2,jz1);
2312 dx22 = _mm256_sub_ps(ix2,jx2);
2313 dy22 = _mm256_sub_ps(iy2,jy2);
2314 dz22 = _mm256_sub_ps(iz2,jz2);
2316 /* Calculate squared distance and things based on it */
2317 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2318 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2319 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2320 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2321 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2322 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2323 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2324 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2325 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2327 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2328 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
2329 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
2330 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
2331 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2332 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2333 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
2334 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2335 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2337 fjx0 = _mm256_setzero_ps();
2338 fjy0 = _mm256_setzero_ps();
2339 fjz0 = _mm256_setzero_ps();
2340 fjx1 = _mm256_setzero_ps();
2341 fjy1 = _mm256_setzero_ps();
2342 fjz1 = _mm256_setzero_ps();
2343 fjx2 = _mm256_setzero_ps();
2344 fjy2 = _mm256_setzero_ps();
2345 fjz2 = _mm256_setzero_ps();
2347 /**************************
2348 * CALCULATE INTERACTIONS *
2349 **************************/
2351 r00 = _mm256_mul_ps(rsq00,rinv00);
2352 r00 = _mm256_andnot_ps(dummy_mask,r00);
2354 /* Calculate table index by multiplying r with table scale and truncate to integer */
2355 rt = _mm256_mul_ps(r00,vftabscale);
2356 vfitab = _mm256_cvttps_epi32(rt);
2357 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2358 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2359 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2360 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2361 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2362 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2364 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2365 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2366 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2367 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2368 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2369 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2370 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2371 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2372 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2373 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2374 Heps = _mm256_mul_ps(vfeps,H);
2375 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2376 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2377 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2379 /* CUBIC SPLINE TABLE DISPERSION */
2380 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2381 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2382 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2383 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2384 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2385 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2386 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2387 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2388 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2389 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2390 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2391 Heps = _mm256_mul_ps(vfeps,H);
2392 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2393 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2394 fvdw6 = _mm256_mul_ps(c6_00,FF);
2396 /* CUBIC SPLINE TABLE REPULSION */
2397 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2398 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2399 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2400 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2401 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2402 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2403 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2404 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2405 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2406 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2407 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2408 Heps = _mm256_mul_ps(vfeps,H);
2409 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2410 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2411 fvdw12 = _mm256_mul_ps(c12_00,FF);
2412 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2414 fscal = _mm256_add_ps(felec,fvdw);
2416 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2418 /* Calculate temporary vectorial force */
2419 tx = _mm256_mul_ps(fscal,dx00);
2420 ty = _mm256_mul_ps(fscal,dy00);
2421 tz = _mm256_mul_ps(fscal,dz00);
2423 /* Update vectorial force */
2424 fix0 = _mm256_add_ps(fix0,tx);
2425 fiy0 = _mm256_add_ps(fiy0,ty);
2426 fiz0 = _mm256_add_ps(fiz0,tz);
2428 fjx0 = _mm256_add_ps(fjx0,tx);
2429 fjy0 = _mm256_add_ps(fjy0,ty);
2430 fjz0 = _mm256_add_ps(fjz0,tz);
2432 /**************************
2433 * CALCULATE INTERACTIONS *
2434 **************************/
2436 r01 = _mm256_mul_ps(rsq01,rinv01);
2437 r01 = _mm256_andnot_ps(dummy_mask,r01);
2439 /* Calculate table index by multiplying r with table scale and truncate to integer */
2440 rt = _mm256_mul_ps(r01,vftabscale);
2441 vfitab = _mm256_cvttps_epi32(rt);
2442 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2443 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2444 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2445 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2446 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2447 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2449 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2450 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2451 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2452 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2453 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2454 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2455 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2456 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2457 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2458 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2459 Heps = _mm256_mul_ps(vfeps,H);
2460 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2461 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2462 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2466 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2468 /* Calculate temporary vectorial force */
2469 tx = _mm256_mul_ps(fscal,dx01);
2470 ty = _mm256_mul_ps(fscal,dy01);
2471 tz = _mm256_mul_ps(fscal,dz01);
2473 /* Update vectorial force */
2474 fix0 = _mm256_add_ps(fix0,tx);
2475 fiy0 = _mm256_add_ps(fiy0,ty);
2476 fiz0 = _mm256_add_ps(fiz0,tz);
2478 fjx1 = _mm256_add_ps(fjx1,tx);
2479 fjy1 = _mm256_add_ps(fjy1,ty);
2480 fjz1 = _mm256_add_ps(fjz1,tz);
2482 /**************************
2483 * CALCULATE INTERACTIONS *
2484 **************************/
2486 r02 = _mm256_mul_ps(rsq02,rinv02);
2487 r02 = _mm256_andnot_ps(dummy_mask,r02);
2489 /* Calculate table index by multiplying r with table scale and truncate to integer */
2490 rt = _mm256_mul_ps(r02,vftabscale);
2491 vfitab = _mm256_cvttps_epi32(rt);
2492 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2493 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2494 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2495 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2496 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2497 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2499 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2500 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2501 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2502 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2503 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2504 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2505 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2506 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2507 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2508 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2509 Heps = _mm256_mul_ps(vfeps,H);
2510 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2511 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2512 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2516 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2518 /* Calculate temporary vectorial force */
2519 tx = _mm256_mul_ps(fscal,dx02);
2520 ty = _mm256_mul_ps(fscal,dy02);
2521 tz = _mm256_mul_ps(fscal,dz02);
2523 /* Update vectorial force */
2524 fix0 = _mm256_add_ps(fix0,tx);
2525 fiy0 = _mm256_add_ps(fiy0,ty);
2526 fiz0 = _mm256_add_ps(fiz0,tz);
2528 fjx2 = _mm256_add_ps(fjx2,tx);
2529 fjy2 = _mm256_add_ps(fjy2,ty);
2530 fjz2 = _mm256_add_ps(fjz2,tz);
2532 /**************************
2533 * CALCULATE INTERACTIONS *
2534 **************************/
2536 r10 = _mm256_mul_ps(rsq10,rinv10);
2537 r10 = _mm256_andnot_ps(dummy_mask,r10);
2539 /* Calculate table index by multiplying r with table scale and truncate to integer */
2540 rt = _mm256_mul_ps(r10,vftabscale);
2541 vfitab = _mm256_cvttps_epi32(rt);
2542 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2543 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2544 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2545 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2546 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2547 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2549 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2550 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2551 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2552 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2553 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2554 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2555 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2556 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2557 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2558 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2559 Heps = _mm256_mul_ps(vfeps,H);
2560 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2561 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2562 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2566 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2568 /* Calculate temporary vectorial force */
2569 tx = _mm256_mul_ps(fscal,dx10);
2570 ty = _mm256_mul_ps(fscal,dy10);
2571 tz = _mm256_mul_ps(fscal,dz10);
2573 /* Update vectorial force */
2574 fix1 = _mm256_add_ps(fix1,tx);
2575 fiy1 = _mm256_add_ps(fiy1,ty);
2576 fiz1 = _mm256_add_ps(fiz1,tz);
2578 fjx0 = _mm256_add_ps(fjx0,tx);
2579 fjy0 = _mm256_add_ps(fjy0,ty);
2580 fjz0 = _mm256_add_ps(fjz0,tz);
2582 /**************************
2583 * CALCULATE INTERACTIONS *
2584 **************************/
2586 r11 = _mm256_mul_ps(rsq11,rinv11);
2587 r11 = _mm256_andnot_ps(dummy_mask,r11);
2589 /* Calculate table index by multiplying r with table scale and truncate to integer */
2590 rt = _mm256_mul_ps(r11,vftabscale);
2591 vfitab = _mm256_cvttps_epi32(rt);
2592 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2593 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2594 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2595 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2596 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2597 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2599 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2600 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2601 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2602 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2603 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2604 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2605 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2606 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2607 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2608 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2609 Heps = _mm256_mul_ps(vfeps,H);
2610 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2611 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2612 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2616 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2618 /* Calculate temporary vectorial force */
2619 tx = _mm256_mul_ps(fscal,dx11);
2620 ty = _mm256_mul_ps(fscal,dy11);
2621 tz = _mm256_mul_ps(fscal,dz11);
2623 /* Update vectorial force */
2624 fix1 = _mm256_add_ps(fix1,tx);
2625 fiy1 = _mm256_add_ps(fiy1,ty);
2626 fiz1 = _mm256_add_ps(fiz1,tz);
2628 fjx1 = _mm256_add_ps(fjx1,tx);
2629 fjy1 = _mm256_add_ps(fjy1,ty);
2630 fjz1 = _mm256_add_ps(fjz1,tz);
2632 /**************************
2633 * CALCULATE INTERACTIONS *
2634 **************************/
2636 r12 = _mm256_mul_ps(rsq12,rinv12);
2637 r12 = _mm256_andnot_ps(dummy_mask,r12);
2639 /* Calculate table index by multiplying r with table scale and truncate to integer */
2640 rt = _mm256_mul_ps(r12,vftabscale);
2641 vfitab = _mm256_cvttps_epi32(rt);
2642 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2643 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2644 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2645 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2646 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2647 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2649 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2650 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2651 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2652 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2653 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2654 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2655 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2656 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2657 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2658 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2659 Heps = _mm256_mul_ps(vfeps,H);
2660 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2661 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2662 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2666 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2668 /* Calculate temporary vectorial force */
2669 tx = _mm256_mul_ps(fscal,dx12);
2670 ty = _mm256_mul_ps(fscal,dy12);
2671 tz = _mm256_mul_ps(fscal,dz12);
2673 /* Update vectorial force */
2674 fix1 = _mm256_add_ps(fix1,tx);
2675 fiy1 = _mm256_add_ps(fiy1,ty);
2676 fiz1 = _mm256_add_ps(fiz1,tz);
2678 fjx2 = _mm256_add_ps(fjx2,tx);
2679 fjy2 = _mm256_add_ps(fjy2,ty);
2680 fjz2 = _mm256_add_ps(fjz2,tz);
2682 /**************************
2683 * CALCULATE INTERACTIONS *
2684 **************************/
2686 r20 = _mm256_mul_ps(rsq20,rinv20);
2687 r20 = _mm256_andnot_ps(dummy_mask,r20);
2689 /* Calculate table index by multiplying r with table scale and truncate to integer */
2690 rt = _mm256_mul_ps(r20,vftabscale);
2691 vfitab = _mm256_cvttps_epi32(rt);
2692 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2693 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2694 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2695 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2696 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2697 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2699 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2700 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2701 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2702 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2703 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2704 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2705 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2706 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2707 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2708 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2709 Heps = _mm256_mul_ps(vfeps,H);
2710 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2711 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2712 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2716 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2718 /* Calculate temporary vectorial force */
2719 tx = _mm256_mul_ps(fscal,dx20);
2720 ty = _mm256_mul_ps(fscal,dy20);
2721 tz = _mm256_mul_ps(fscal,dz20);
2723 /* Update vectorial force */
2724 fix2 = _mm256_add_ps(fix2,tx);
2725 fiy2 = _mm256_add_ps(fiy2,ty);
2726 fiz2 = _mm256_add_ps(fiz2,tz);
2728 fjx0 = _mm256_add_ps(fjx0,tx);
2729 fjy0 = _mm256_add_ps(fjy0,ty);
2730 fjz0 = _mm256_add_ps(fjz0,tz);
2732 /**************************
2733 * CALCULATE INTERACTIONS *
2734 **************************/
2736 r21 = _mm256_mul_ps(rsq21,rinv21);
2737 r21 = _mm256_andnot_ps(dummy_mask,r21);
2739 /* Calculate table index by multiplying r with table scale and truncate to integer */
2740 rt = _mm256_mul_ps(r21,vftabscale);
2741 vfitab = _mm256_cvttps_epi32(rt);
2742 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2743 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2744 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2745 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2746 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2747 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2749 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2750 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2751 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2752 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2753 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2754 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2755 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2756 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2757 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2758 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2759 Heps = _mm256_mul_ps(vfeps,H);
2760 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2761 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2762 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2766 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2768 /* Calculate temporary vectorial force */
2769 tx = _mm256_mul_ps(fscal,dx21);
2770 ty = _mm256_mul_ps(fscal,dy21);
2771 tz = _mm256_mul_ps(fscal,dz21);
2773 /* Update vectorial force */
2774 fix2 = _mm256_add_ps(fix2,tx);
2775 fiy2 = _mm256_add_ps(fiy2,ty);
2776 fiz2 = _mm256_add_ps(fiz2,tz);
2778 fjx1 = _mm256_add_ps(fjx1,tx);
2779 fjy1 = _mm256_add_ps(fjy1,ty);
2780 fjz1 = _mm256_add_ps(fjz1,tz);
2782 /**************************
2783 * CALCULATE INTERACTIONS *
2784 **************************/
2786 r22 = _mm256_mul_ps(rsq22,rinv22);
2787 r22 = _mm256_andnot_ps(dummy_mask,r22);
2789 /* Calculate table index by multiplying r with table scale and truncate to integer */
2790 rt = _mm256_mul_ps(r22,vftabscale);
2791 vfitab = _mm256_cvttps_epi32(rt);
2792 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2793 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2794 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2795 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2796 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2797 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2799 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2800 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2801 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2802 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2803 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2804 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2805 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2806 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2807 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2808 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2809 Heps = _mm256_mul_ps(vfeps,H);
2810 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2811 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2812 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2816 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2818 /* Calculate temporary vectorial force */
2819 tx = _mm256_mul_ps(fscal,dx22);
2820 ty = _mm256_mul_ps(fscal,dy22);
2821 tz = _mm256_mul_ps(fscal,dz22);
2823 /* Update vectorial force */
2824 fix2 = _mm256_add_ps(fix2,tx);
2825 fiy2 = _mm256_add_ps(fiy2,ty);
2826 fiz2 = _mm256_add_ps(fiz2,tz);
2828 fjx2 = _mm256_add_ps(fjx2,tx);
2829 fjy2 = _mm256_add_ps(fjy2,ty);
2830 fjz2 = _mm256_add_ps(fjz2,tz);
2832 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2833 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2834 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2835 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2836 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2837 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2838 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2839 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2841 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2842 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2844 /* Inner loop uses 382 flops */
2847 /* End of innermost loop */
2849 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2850 f+i_coord_offset,fshift+i_shift_offset);
2852 /* Increment number of inner iterations */
2853 inneriter += j_index_end - j_index_start;
2855 /* Outer loop uses 18 flops */
2858 /* Increment number of outer iterations */
2861 /* Update outer/inner flops */
2863 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*382);