2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_single
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: None
54 * Geometry: Water4-Water4
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70 * jnr indices corresponding to data put in the eight positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrE,jnrF,jnrG,jnrH;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
85 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr1;
87 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 real * vdwioffsetptr3;
91 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
92 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
93 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
94 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
95 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
96 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
97 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
98 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
99 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
100 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
101 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
102 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
103 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
104 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
105 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
106 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
107 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
110 __m128i vfitab_lo,vfitab_hi;
111 __m128i ifour = _mm_set1_epi32(4);
112 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
114 __m256 dummy_mask,cutoff_mask;
115 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
116 __m256 one = _mm256_set1_ps(1.0);
117 __m256 two = _mm256_set1_ps(2.0);
123 jindex = nlist->jindex;
125 shiftidx = nlist->shift;
127 shiftvec = fr->shift_vec[0];
128 fshift = fr->fshift[0];
129 facel = _mm256_set1_ps(fr->epsfac);
130 charge = mdatoms->chargeA;
132 vftab = kernel_data->table_elec->data;
133 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
135 /* Setup water-specific parameters */
136 inr = nlist->iinr[0];
137 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
138 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
139 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
141 jq1 = _mm256_set1_ps(charge[inr+1]);
142 jq2 = _mm256_set1_ps(charge[inr+2]);
143 jq3 = _mm256_set1_ps(charge[inr+3]);
144 qq11 = _mm256_mul_ps(iq1,jq1);
145 qq12 = _mm256_mul_ps(iq1,jq2);
146 qq13 = _mm256_mul_ps(iq1,jq3);
147 qq21 = _mm256_mul_ps(iq2,jq1);
148 qq22 = _mm256_mul_ps(iq2,jq2);
149 qq23 = _mm256_mul_ps(iq2,jq3);
150 qq31 = _mm256_mul_ps(iq3,jq1);
151 qq32 = _mm256_mul_ps(iq3,jq2);
152 qq33 = _mm256_mul_ps(iq3,jq3);
154 /* Avoid stupid compiler warnings */
155 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
168 for(iidx=0;iidx<4*DIM;iidx++)
173 /* Start outer loop over neighborlists */
174 for(iidx=0; iidx<nri; iidx++)
176 /* Load shift vector for this list */
177 i_shift_offset = DIM*shiftidx[iidx];
179 /* Load limits for loop over neighbors */
180 j_index_start = jindex[iidx];
181 j_index_end = jindex[iidx+1];
183 /* Get outer coordinate index */
185 i_coord_offset = DIM*inr;
187 /* Load i particle coords and add shift vector */
188 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
189 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
191 fix1 = _mm256_setzero_ps();
192 fiy1 = _mm256_setzero_ps();
193 fiz1 = _mm256_setzero_ps();
194 fix2 = _mm256_setzero_ps();
195 fiy2 = _mm256_setzero_ps();
196 fiz2 = _mm256_setzero_ps();
197 fix3 = _mm256_setzero_ps();
198 fiy3 = _mm256_setzero_ps();
199 fiz3 = _mm256_setzero_ps();
201 /* Reset potential sums */
202 velecsum = _mm256_setzero_ps();
204 /* Start inner kernel loop */
205 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
208 /* Get j neighbor index, and coordinate index */
217 j_coord_offsetA = DIM*jnrA;
218 j_coord_offsetB = DIM*jnrB;
219 j_coord_offsetC = DIM*jnrC;
220 j_coord_offsetD = DIM*jnrD;
221 j_coord_offsetE = DIM*jnrE;
222 j_coord_offsetF = DIM*jnrF;
223 j_coord_offsetG = DIM*jnrG;
224 j_coord_offsetH = DIM*jnrH;
226 /* load j atom coordinates */
227 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
228 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
229 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
230 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
231 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
233 /* Calculate displacement vector */
234 dx11 = _mm256_sub_ps(ix1,jx1);
235 dy11 = _mm256_sub_ps(iy1,jy1);
236 dz11 = _mm256_sub_ps(iz1,jz1);
237 dx12 = _mm256_sub_ps(ix1,jx2);
238 dy12 = _mm256_sub_ps(iy1,jy2);
239 dz12 = _mm256_sub_ps(iz1,jz2);
240 dx13 = _mm256_sub_ps(ix1,jx3);
241 dy13 = _mm256_sub_ps(iy1,jy3);
242 dz13 = _mm256_sub_ps(iz1,jz3);
243 dx21 = _mm256_sub_ps(ix2,jx1);
244 dy21 = _mm256_sub_ps(iy2,jy1);
245 dz21 = _mm256_sub_ps(iz2,jz1);
246 dx22 = _mm256_sub_ps(ix2,jx2);
247 dy22 = _mm256_sub_ps(iy2,jy2);
248 dz22 = _mm256_sub_ps(iz2,jz2);
249 dx23 = _mm256_sub_ps(ix2,jx3);
250 dy23 = _mm256_sub_ps(iy2,jy3);
251 dz23 = _mm256_sub_ps(iz2,jz3);
252 dx31 = _mm256_sub_ps(ix3,jx1);
253 dy31 = _mm256_sub_ps(iy3,jy1);
254 dz31 = _mm256_sub_ps(iz3,jz1);
255 dx32 = _mm256_sub_ps(ix3,jx2);
256 dy32 = _mm256_sub_ps(iy3,jy2);
257 dz32 = _mm256_sub_ps(iz3,jz2);
258 dx33 = _mm256_sub_ps(ix3,jx3);
259 dy33 = _mm256_sub_ps(iy3,jy3);
260 dz33 = _mm256_sub_ps(iz3,jz3);
262 /* Calculate squared distance and things based on it */
263 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
264 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
265 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
266 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
267 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
268 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
269 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
270 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
271 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
273 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
274 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
275 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
276 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
277 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
278 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
279 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
280 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
281 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
283 fjx1 = _mm256_setzero_ps();
284 fjy1 = _mm256_setzero_ps();
285 fjz1 = _mm256_setzero_ps();
286 fjx2 = _mm256_setzero_ps();
287 fjy2 = _mm256_setzero_ps();
288 fjz2 = _mm256_setzero_ps();
289 fjx3 = _mm256_setzero_ps();
290 fjy3 = _mm256_setzero_ps();
291 fjz3 = _mm256_setzero_ps();
293 /**************************
294 * CALCULATE INTERACTIONS *
295 **************************/
297 r11 = _mm256_mul_ps(rsq11,rinv11);
299 /* Calculate table index by multiplying r with table scale and truncate to integer */
300 rt = _mm256_mul_ps(r11,vftabscale);
301 vfitab = _mm256_cvttps_epi32(rt);
302 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
303 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
304 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
305 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
306 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
307 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
309 /* CUBIC SPLINE TABLE ELECTROSTATICS */
310 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
311 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
312 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
313 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
314 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
315 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
316 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
317 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
318 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
319 Heps = _mm256_mul_ps(vfeps,H);
320 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
321 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
322 velec = _mm256_mul_ps(qq11,VV);
323 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
324 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
326 /* Update potential sum for this i atom from the interaction with this j atom. */
327 velecsum = _mm256_add_ps(velecsum,velec);
331 /* Calculate temporary vectorial force */
332 tx = _mm256_mul_ps(fscal,dx11);
333 ty = _mm256_mul_ps(fscal,dy11);
334 tz = _mm256_mul_ps(fscal,dz11);
336 /* Update vectorial force */
337 fix1 = _mm256_add_ps(fix1,tx);
338 fiy1 = _mm256_add_ps(fiy1,ty);
339 fiz1 = _mm256_add_ps(fiz1,tz);
341 fjx1 = _mm256_add_ps(fjx1,tx);
342 fjy1 = _mm256_add_ps(fjy1,ty);
343 fjz1 = _mm256_add_ps(fjz1,tz);
345 /**************************
346 * CALCULATE INTERACTIONS *
347 **************************/
349 r12 = _mm256_mul_ps(rsq12,rinv12);
351 /* Calculate table index by multiplying r with table scale and truncate to integer */
352 rt = _mm256_mul_ps(r12,vftabscale);
353 vfitab = _mm256_cvttps_epi32(rt);
354 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
355 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
356 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
357 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
358 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
359 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
361 /* CUBIC SPLINE TABLE ELECTROSTATICS */
362 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
363 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
364 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
365 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
366 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
367 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
368 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
369 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
370 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
371 Heps = _mm256_mul_ps(vfeps,H);
372 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
373 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
374 velec = _mm256_mul_ps(qq12,VV);
375 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
376 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velecsum = _mm256_add_ps(velecsum,velec);
383 /* Calculate temporary vectorial force */
384 tx = _mm256_mul_ps(fscal,dx12);
385 ty = _mm256_mul_ps(fscal,dy12);
386 tz = _mm256_mul_ps(fscal,dz12);
388 /* Update vectorial force */
389 fix1 = _mm256_add_ps(fix1,tx);
390 fiy1 = _mm256_add_ps(fiy1,ty);
391 fiz1 = _mm256_add_ps(fiz1,tz);
393 fjx2 = _mm256_add_ps(fjx2,tx);
394 fjy2 = _mm256_add_ps(fjy2,ty);
395 fjz2 = _mm256_add_ps(fjz2,tz);
397 /**************************
398 * CALCULATE INTERACTIONS *
399 **************************/
401 r13 = _mm256_mul_ps(rsq13,rinv13);
403 /* Calculate table index by multiplying r with table scale and truncate to integer */
404 rt = _mm256_mul_ps(r13,vftabscale);
405 vfitab = _mm256_cvttps_epi32(rt);
406 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
407 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
408 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
409 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
410 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
411 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
413 /* CUBIC SPLINE TABLE ELECTROSTATICS */
414 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
415 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
416 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
417 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
418 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
419 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
420 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
421 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
422 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
423 Heps = _mm256_mul_ps(vfeps,H);
424 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
425 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
426 velec = _mm256_mul_ps(qq13,VV);
427 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
428 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velecsum = _mm256_add_ps(velecsum,velec);
435 /* Calculate temporary vectorial force */
436 tx = _mm256_mul_ps(fscal,dx13);
437 ty = _mm256_mul_ps(fscal,dy13);
438 tz = _mm256_mul_ps(fscal,dz13);
440 /* Update vectorial force */
441 fix1 = _mm256_add_ps(fix1,tx);
442 fiy1 = _mm256_add_ps(fiy1,ty);
443 fiz1 = _mm256_add_ps(fiz1,tz);
445 fjx3 = _mm256_add_ps(fjx3,tx);
446 fjy3 = _mm256_add_ps(fjy3,ty);
447 fjz3 = _mm256_add_ps(fjz3,tz);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 r21 = _mm256_mul_ps(rsq21,rinv21);
455 /* Calculate table index by multiplying r with table scale and truncate to integer */
456 rt = _mm256_mul_ps(r21,vftabscale);
457 vfitab = _mm256_cvttps_epi32(rt);
458 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
459 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
460 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
461 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
462 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
463 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
465 /* CUBIC SPLINE TABLE ELECTROSTATICS */
466 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
467 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
468 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
469 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
470 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
471 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
472 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
473 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
474 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
475 Heps = _mm256_mul_ps(vfeps,H);
476 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
477 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
478 velec = _mm256_mul_ps(qq21,VV);
479 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
480 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
482 /* Update potential sum for this i atom from the interaction with this j atom. */
483 velecsum = _mm256_add_ps(velecsum,velec);
487 /* Calculate temporary vectorial force */
488 tx = _mm256_mul_ps(fscal,dx21);
489 ty = _mm256_mul_ps(fscal,dy21);
490 tz = _mm256_mul_ps(fscal,dz21);
492 /* Update vectorial force */
493 fix2 = _mm256_add_ps(fix2,tx);
494 fiy2 = _mm256_add_ps(fiy2,ty);
495 fiz2 = _mm256_add_ps(fiz2,tz);
497 fjx1 = _mm256_add_ps(fjx1,tx);
498 fjy1 = _mm256_add_ps(fjy1,ty);
499 fjz1 = _mm256_add_ps(fjz1,tz);
501 /**************************
502 * CALCULATE INTERACTIONS *
503 **************************/
505 r22 = _mm256_mul_ps(rsq22,rinv22);
507 /* Calculate table index by multiplying r with table scale and truncate to integer */
508 rt = _mm256_mul_ps(r22,vftabscale);
509 vfitab = _mm256_cvttps_epi32(rt);
510 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
511 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
512 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
513 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
514 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
515 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
517 /* CUBIC SPLINE TABLE ELECTROSTATICS */
518 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
519 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
520 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
521 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
522 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
523 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
524 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
525 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
526 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
527 Heps = _mm256_mul_ps(vfeps,H);
528 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
529 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
530 velec = _mm256_mul_ps(qq22,VV);
531 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
532 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velecsum = _mm256_add_ps(velecsum,velec);
539 /* Calculate temporary vectorial force */
540 tx = _mm256_mul_ps(fscal,dx22);
541 ty = _mm256_mul_ps(fscal,dy22);
542 tz = _mm256_mul_ps(fscal,dz22);
544 /* Update vectorial force */
545 fix2 = _mm256_add_ps(fix2,tx);
546 fiy2 = _mm256_add_ps(fiy2,ty);
547 fiz2 = _mm256_add_ps(fiz2,tz);
549 fjx2 = _mm256_add_ps(fjx2,tx);
550 fjy2 = _mm256_add_ps(fjy2,ty);
551 fjz2 = _mm256_add_ps(fjz2,tz);
553 /**************************
554 * CALCULATE INTERACTIONS *
555 **************************/
557 r23 = _mm256_mul_ps(rsq23,rinv23);
559 /* Calculate table index by multiplying r with table scale and truncate to integer */
560 rt = _mm256_mul_ps(r23,vftabscale);
561 vfitab = _mm256_cvttps_epi32(rt);
562 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
563 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
564 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
565 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
566 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
567 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
569 /* CUBIC SPLINE TABLE ELECTROSTATICS */
570 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
571 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
572 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
573 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
574 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
575 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
576 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
577 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
578 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
579 Heps = _mm256_mul_ps(vfeps,H);
580 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
581 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
582 velec = _mm256_mul_ps(qq23,VV);
583 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
584 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
586 /* Update potential sum for this i atom from the interaction with this j atom. */
587 velecsum = _mm256_add_ps(velecsum,velec);
591 /* Calculate temporary vectorial force */
592 tx = _mm256_mul_ps(fscal,dx23);
593 ty = _mm256_mul_ps(fscal,dy23);
594 tz = _mm256_mul_ps(fscal,dz23);
596 /* Update vectorial force */
597 fix2 = _mm256_add_ps(fix2,tx);
598 fiy2 = _mm256_add_ps(fiy2,ty);
599 fiz2 = _mm256_add_ps(fiz2,tz);
601 fjx3 = _mm256_add_ps(fjx3,tx);
602 fjy3 = _mm256_add_ps(fjy3,ty);
603 fjz3 = _mm256_add_ps(fjz3,tz);
605 /**************************
606 * CALCULATE INTERACTIONS *
607 **************************/
609 r31 = _mm256_mul_ps(rsq31,rinv31);
611 /* Calculate table index by multiplying r with table scale and truncate to integer */
612 rt = _mm256_mul_ps(r31,vftabscale);
613 vfitab = _mm256_cvttps_epi32(rt);
614 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
615 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
616 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
617 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
618 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
619 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
621 /* CUBIC SPLINE TABLE ELECTROSTATICS */
622 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
623 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
624 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
625 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
626 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
627 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
628 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
629 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
630 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
631 Heps = _mm256_mul_ps(vfeps,H);
632 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
633 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
634 velec = _mm256_mul_ps(qq31,VV);
635 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
636 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
638 /* Update potential sum for this i atom from the interaction with this j atom. */
639 velecsum = _mm256_add_ps(velecsum,velec);
643 /* Calculate temporary vectorial force */
644 tx = _mm256_mul_ps(fscal,dx31);
645 ty = _mm256_mul_ps(fscal,dy31);
646 tz = _mm256_mul_ps(fscal,dz31);
648 /* Update vectorial force */
649 fix3 = _mm256_add_ps(fix3,tx);
650 fiy3 = _mm256_add_ps(fiy3,ty);
651 fiz3 = _mm256_add_ps(fiz3,tz);
653 fjx1 = _mm256_add_ps(fjx1,tx);
654 fjy1 = _mm256_add_ps(fjy1,ty);
655 fjz1 = _mm256_add_ps(fjz1,tz);
657 /**************************
658 * CALCULATE INTERACTIONS *
659 **************************/
661 r32 = _mm256_mul_ps(rsq32,rinv32);
663 /* Calculate table index by multiplying r with table scale and truncate to integer */
664 rt = _mm256_mul_ps(r32,vftabscale);
665 vfitab = _mm256_cvttps_epi32(rt);
666 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
667 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
668 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
669 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
670 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
671 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
673 /* CUBIC SPLINE TABLE ELECTROSTATICS */
674 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
675 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
676 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
677 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
678 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
679 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
680 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
681 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
682 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
683 Heps = _mm256_mul_ps(vfeps,H);
684 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
685 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
686 velec = _mm256_mul_ps(qq32,VV);
687 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
688 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
690 /* Update potential sum for this i atom from the interaction with this j atom. */
691 velecsum = _mm256_add_ps(velecsum,velec);
695 /* Calculate temporary vectorial force */
696 tx = _mm256_mul_ps(fscal,dx32);
697 ty = _mm256_mul_ps(fscal,dy32);
698 tz = _mm256_mul_ps(fscal,dz32);
700 /* Update vectorial force */
701 fix3 = _mm256_add_ps(fix3,tx);
702 fiy3 = _mm256_add_ps(fiy3,ty);
703 fiz3 = _mm256_add_ps(fiz3,tz);
705 fjx2 = _mm256_add_ps(fjx2,tx);
706 fjy2 = _mm256_add_ps(fjy2,ty);
707 fjz2 = _mm256_add_ps(fjz2,tz);
709 /**************************
710 * CALCULATE INTERACTIONS *
711 **************************/
713 r33 = _mm256_mul_ps(rsq33,rinv33);
715 /* Calculate table index by multiplying r with table scale and truncate to integer */
716 rt = _mm256_mul_ps(r33,vftabscale);
717 vfitab = _mm256_cvttps_epi32(rt);
718 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
719 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
720 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
721 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
722 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
723 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
725 /* CUBIC SPLINE TABLE ELECTROSTATICS */
726 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
727 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
728 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
729 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
730 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
731 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
732 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
733 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
734 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
735 Heps = _mm256_mul_ps(vfeps,H);
736 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
737 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
738 velec = _mm256_mul_ps(qq33,VV);
739 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
740 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
742 /* Update potential sum for this i atom from the interaction with this j atom. */
743 velecsum = _mm256_add_ps(velecsum,velec);
747 /* Calculate temporary vectorial force */
748 tx = _mm256_mul_ps(fscal,dx33);
749 ty = _mm256_mul_ps(fscal,dy33);
750 tz = _mm256_mul_ps(fscal,dz33);
752 /* Update vectorial force */
753 fix3 = _mm256_add_ps(fix3,tx);
754 fiy3 = _mm256_add_ps(fiy3,ty);
755 fiz3 = _mm256_add_ps(fiz3,tz);
757 fjx3 = _mm256_add_ps(fjx3,tx);
758 fjy3 = _mm256_add_ps(fjy3,ty);
759 fjz3 = _mm256_add_ps(fjz3,tz);
761 fjptrA = f+j_coord_offsetA;
762 fjptrB = f+j_coord_offsetB;
763 fjptrC = f+j_coord_offsetC;
764 fjptrD = f+j_coord_offsetD;
765 fjptrE = f+j_coord_offsetE;
766 fjptrF = f+j_coord_offsetF;
767 fjptrG = f+j_coord_offsetG;
768 fjptrH = f+j_coord_offsetH;
770 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
771 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
772 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
774 /* Inner loop uses 387 flops */
780 /* Get j neighbor index, and coordinate index */
781 jnrlistA = jjnr[jidx];
782 jnrlistB = jjnr[jidx+1];
783 jnrlistC = jjnr[jidx+2];
784 jnrlistD = jjnr[jidx+3];
785 jnrlistE = jjnr[jidx+4];
786 jnrlistF = jjnr[jidx+5];
787 jnrlistG = jjnr[jidx+6];
788 jnrlistH = jjnr[jidx+7];
789 /* Sign of each element will be negative for non-real atoms.
790 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
791 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
793 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
794 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
796 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
797 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
798 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
799 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
800 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
801 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
802 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
803 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
804 j_coord_offsetA = DIM*jnrA;
805 j_coord_offsetB = DIM*jnrB;
806 j_coord_offsetC = DIM*jnrC;
807 j_coord_offsetD = DIM*jnrD;
808 j_coord_offsetE = DIM*jnrE;
809 j_coord_offsetF = DIM*jnrF;
810 j_coord_offsetG = DIM*jnrG;
811 j_coord_offsetH = DIM*jnrH;
813 /* load j atom coordinates */
814 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
815 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
816 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
817 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
818 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
820 /* Calculate displacement vector */
821 dx11 = _mm256_sub_ps(ix1,jx1);
822 dy11 = _mm256_sub_ps(iy1,jy1);
823 dz11 = _mm256_sub_ps(iz1,jz1);
824 dx12 = _mm256_sub_ps(ix1,jx2);
825 dy12 = _mm256_sub_ps(iy1,jy2);
826 dz12 = _mm256_sub_ps(iz1,jz2);
827 dx13 = _mm256_sub_ps(ix1,jx3);
828 dy13 = _mm256_sub_ps(iy1,jy3);
829 dz13 = _mm256_sub_ps(iz1,jz3);
830 dx21 = _mm256_sub_ps(ix2,jx1);
831 dy21 = _mm256_sub_ps(iy2,jy1);
832 dz21 = _mm256_sub_ps(iz2,jz1);
833 dx22 = _mm256_sub_ps(ix2,jx2);
834 dy22 = _mm256_sub_ps(iy2,jy2);
835 dz22 = _mm256_sub_ps(iz2,jz2);
836 dx23 = _mm256_sub_ps(ix2,jx3);
837 dy23 = _mm256_sub_ps(iy2,jy3);
838 dz23 = _mm256_sub_ps(iz2,jz3);
839 dx31 = _mm256_sub_ps(ix3,jx1);
840 dy31 = _mm256_sub_ps(iy3,jy1);
841 dz31 = _mm256_sub_ps(iz3,jz1);
842 dx32 = _mm256_sub_ps(ix3,jx2);
843 dy32 = _mm256_sub_ps(iy3,jy2);
844 dz32 = _mm256_sub_ps(iz3,jz2);
845 dx33 = _mm256_sub_ps(ix3,jx3);
846 dy33 = _mm256_sub_ps(iy3,jy3);
847 dz33 = _mm256_sub_ps(iz3,jz3);
849 /* Calculate squared distance and things based on it */
850 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
851 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
852 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
853 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
854 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
855 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
856 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
857 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
858 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
860 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
861 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
862 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
863 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
864 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
865 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
866 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
867 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
868 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
870 fjx1 = _mm256_setzero_ps();
871 fjy1 = _mm256_setzero_ps();
872 fjz1 = _mm256_setzero_ps();
873 fjx2 = _mm256_setzero_ps();
874 fjy2 = _mm256_setzero_ps();
875 fjz2 = _mm256_setzero_ps();
876 fjx3 = _mm256_setzero_ps();
877 fjy3 = _mm256_setzero_ps();
878 fjz3 = _mm256_setzero_ps();
880 /**************************
881 * CALCULATE INTERACTIONS *
882 **************************/
884 r11 = _mm256_mul_ps(rsq11,rinv11);
885 r11 = _mm256_andnot_ps(dummy_mask,r11);
887 /* Calculate table index by multiplying r with table scale and truncate to integer */
888 rt = _mm256_mul_ps(r11,vftabscale);
889 vfitab = _mm256_cvttps_epi32(rt);
890 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
891 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
892 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
893 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
894 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
895 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
897 /* CUBIC SPLINE TABLE ELECTROSTATICS */
898 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
899 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
900 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
901 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
902 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
903 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
904 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
905 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
906 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
907 Heps = _mm256_mul_ps(vfeps,H);
908 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
909 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
910 velec = _mm256_mul_ps(qq11,VV);
911 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
912 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
914 /* Update potential sum for this i atom from the interaction with this j atom. */
915 velec = _mm256_andnot_ps(dummy_mask,velec);
916 velecsum = _mm256_add_ps(velecsum,velec);
920 fscal = _mm256_andnot_ps(dummy_mask,fscal);
922 /* Calculate temporary vectorial force */
923 tx = _mm256_mul_ps(fscal,dx11);
924 ty = _mm256_mul_ps(fscal,dy11);
925 tz = _mm256_mul_ps(fscal,dz11);
927 /* Update vectorial force */
928 fix1 = _mm256_add_ps(fix1,tx);
929 fiy1 = _mm256_add_ps(fiy1,ty);
930 fiz1 = _mm256_add_ps(fiz1,tz);
932 fjx1 = _mm256_add_ps(fjx1,tx);
933 fjy1 = _mm256_add_ps(fjy1,ty);
934 fjz1 = _mm256_add_ps(fjz1,tz);
936 /**************************
937 * CALCULATE INTERACTIONS *
938 **************************/
940 r12 = _mm256_mul_ps(rsq12,rinv12);
941 r12 = _mm256_andnot_ps(dummy_mask,r12);
943 /* Calculate table index by multiplying r with table scale and truncate to integer */
944 rt = _mm256_mul_ps(r12,vftabscale);
945 vfitab = _mm256_cvttps_epi32(rt);
946 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
947 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
948 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
949 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
950 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
951 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
953 /* CUBIC SPLINE TABLE ELECTROSTATICS */
954 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
955 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
956 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
957 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
958 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
959 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
960 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
961 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
962 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
963 Heps = _mm256_mul_ps(vfeps,H);
964 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
965 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
966 velec = _mm256_mul_ps(qq12,VV);
967 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
968 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
970 /* Update potential sum for this i atom from the interaction with this j atom. */
971 velec = _mm256_andnot_ps(dummy_mask,velec);
972 velecsum = _mm256_add_ps(velecsum,velec);
976 fscal = _mm256_andnot_ps(dummy_mask,fscal);
978 /* Calculate temporary vectorial force */
979 tx = _mm256_mul_ps(fscal,dx12);
980 ty = _mm256_mul_ps(fscal,dy12);
981 tz = _mm256_mul_ps(fscal,dz12);
983 /* Update vectorial force */
984 fix1 = _mm256_add_ps(fix1,tx);
985 fiy1 = _mm256_add_ps(fiy1,ty);
986 fiz1 = _mm256_add_ps(fiz1,tz);
988 fjx2 = _mm256_add_ps(fjx2,tx);
989 fjy2 = _mm256_add_ps(fjy2,ty);
990 fjz2 = _mm256_add_ps(fjz2,tz);
992 /**************************
993 * CALCULATE INTERACTIONS *
994 **************************/
996 r13 = _mm256_mul_ps(rsq13,rinv13);
997 r13 = _mm256_andnot_ps(dummy_mask,r13);
999 /* Calculate table index by multiplying r with table scale and truncate to integer */
1000 rt = _mm256_mul_ps(r13,vftabscale);
1001 vfitab = _mm256_cvttps_epi32(rt);
1002 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1003 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1004 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1005 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1006 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1007 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1009 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1010 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1011 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1012 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1013 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1014 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1015 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1016 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1017 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1018 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1019 Heps = _mm256_mul_ps(vfeps,H);
1020 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1021 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1022 velec = _mm256_mul_ps(qq13,VV);
1023 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1024 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
1026 /* Update potential sum for this i atom from the interaction with this j atom. */
1027 velec = _mm256_andnot_ps(dummy_mask,velec);
1028 velecsum = _mm256_add_ps(velecsum,velec);
1032 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1034 /* Calculate temporary vectorial force */
1035 tx = _mm256_mul_ps(fscal,dx13);
1036 ty = _mm256_mul_ps(fscal,dy13);
1037 tz = _mm256_mul_ps(fscal,dz13);
1039 /* Update vectorial force */
1040 fix1 = _mm256_add_ps(fix1,tx);
1041 fiy1 = _mm256_add_ps(fiy1,ty);
1042 fiz1 = _mm256_add_ps(fiz1,tz);
1044 fjx3 = _mm256_add_ps(fjx3,tx);
1045 fjy3 = _mm256_add_ps(fjy3,ty);
1046 fjz3 = _mm256_add_ps(fjz3,tz);
1048 /**************************
1049 * CALCULATE INTERACTIONS *
1050 **************************/
1052 r21 = _mm256_mul_ps(rsq21,rinv21);
1053 r21 = _mm256_andnot_ps(dummy_mask,r21);
1055 /* Calculate table index by multiplying r with table scale and truncate to integer */
1056 rt = _mm256_mul_ps(r21,vftabscale);
1057 vfitab = _mm256_cvttps_epi32(rt);
1058 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1059 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1060 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1061 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1062 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1063 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1065 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1066 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1067 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1068 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1069 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1070 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1071 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1072 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1073 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1074 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1075 Heps = _mm256_mul_ps(vfeps,H);
1076 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1077 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1078 velec = _mm256_mul_ps(qq21,VV);
1079 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1080 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1082 /* Update potential sum for this i atom from the interaction with this j atom. */
1083 velec = _mm256_andnot_ps(dummy_mask,velec);
1084 velecsum = _mm256_add_ps(velecsum,velec);
1088 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1090 /* Calculate temporary vectorial force */
1091 tx = _mm256_mul_ps(fscal,dx21);
1092 ty = _mm256_mul_ps(fscal,dy21);
1093 tz = _mm256_mul_ps(fscal,dz21);
1095 /* Update vectorial force */
1096 fix2 = _mm256_add_ps(fix2,tx);
1097 fiy2 = _mm256_add_ps(fiy2,ty);
1098 fiz2 = _mm256_add_ps(fiz2,tz);
1100 fjx1 = _mm256_add_ps(fjx1,tx);
1101 fjy1 = _mm256_add_ps(fjy1,ty);
1102 fjz1 = _mm256_add_ps(fjz1,tz);
1104 /**************************
1105 * CALCULATE INTERACTIONS *
1106 **************************/
1108 r22 = _mm256_mul_ps(rsq22,rinv22);
1109 r22 = _mm256_andnot_ps(dummy_mask,r22);
1111 /* Calculate table index by multiplying r with table scale and truncate to integer */
1112 rt = _mm256_mul_ps(r22,vftabscale);
1113 vfitab = _mm256_cvttps_epi32(rt);
1114 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1115 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1116 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1117 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1118 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1119 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1121 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1122 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1123 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1124 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1125 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1126 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1127 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1128 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1129 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1130 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1131 Heps = _mm256_mul_ps(vfeps,H);
1132 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1133 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1134 velec = _mm256_mul_ps(qq22,VV);
1135 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1136 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1138 /* Update potential sum for this i atom from the interaction with this j atom. */
1139 velec = _mm256_andnot_ps(dummy_mask,velec);
1140 velecsum = _mm256_add_ps(velecsum,velec);
1144 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1146 /* Calculate temporary vectorial force */
1147 tx = _mm256_mul_ps(fscal,dx22);
1148 ty = _mm256_mul_ps(fscal,dy22);
1149 tz = _mm256_mul_ps(fscal,dz22);
1151 /* Update vectorial force */
1152 fix2 = _mm256_add_ps(fix2,tx);
1153 fiy2 = _mm256_add_ps(fiy2,ty);
1154 fiz2 = _mm256_add_ps(fiz2,tz);
1156 fjx2 = _mm256_add_ps(fjx2,tx);
1157 fjy2 = _mm256_add_ps(fjy2,ty);
1158 fjz2 = _mm256_add_ps(fjz2,tz);
1160 /**************************
1161 * CALCULATE INTERACTIONS *
1162 **************************/
1164 r23 = _mm256_mul_ps(rsq23,rinv23);
1165 r23 = _mm256_andnot_ps(dummy_mask,r23);
1167 /* Calculate table index by multiplying r with table scale and truncate to integer */
1168 rt = _mm256_mul_ps(r23,vftabscale);
1169 vfitab = _mm256_cvttps_epi32(rt);
1170 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1171 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1172 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1173 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1174 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1175 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1177 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1178 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1179 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1180 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1181 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1182 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1183 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1184 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1185 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1186 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1187 Heps = _mm256_mul_ps(vfeps,H);
1188 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1189 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1190 velec = _mm256_mul_ps(qq23,VV);
1191 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1192 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
1194 /* Update potential sum for this i atom from the interaction with this j atom. */
1195 velec = _mm256_andnot_ps(dummy_mask,velec);
1196 velecsum = _mm256_add_ps(velecsum,velec);
1200 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1202 /* Calculate temporary vectorial force */
1203 tx = _mm256_mul_ps(fscal,dx23);
1204 ty = _mm256_mul_ps(fscal,dy23);
1205 tz = _mm256_mul_ps(fscal,dz23);
1207 /* Update vectorial force */
1208 fix2 = _mm256_add_ps(fix2,tx);
1209 fiy2 = _mm256_add_ps(fiy2,ty);
1210 fiz2 = _mm256_add_ps(fiz2,tz);
1212 fjx3 = _mm256_add_ps(fjx3,tx);
1213 fjy3 = _mm256_add_ps(fjy3,ty);
1214 fjz3 = _mm256_add_ps(fjz3,tz);
1216 /**************************
1217 * CALCULATE INTERACTIONS *
1218 **************************/
1220 r31 = _mm256_mul_ps(rsq31,rinv31);
1221 r31 = _mm256_andnot_ps(dummy_mask,r31);
1223 /* Calculate table index by multiplying r with table scale and truncate to integer */
1224 rt = _mm256_mul_ps(r31,vftabscale);
1225 vfitab = _mm256_cvttps_epi32(rt);
1226 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1227 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1228 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1229 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1230 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1231 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1233 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1234 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1235 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1236 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1237 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1238 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1239 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1240 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1241 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1242 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1243 Heps = _mm256_mul_ps(vfeps,H);
1244 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1245 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1246 velec = _mm256_mul_ps(qq31,VV);
1247 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1248 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
1250 /* Update potential sum for this i atom from the interaction with this j atom. */
1251 velec = _mm256_andnot_ps(dummy_mask,velec);
1252 velecsum = _mm256_add_ps(velecsum,velec);
1256 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1258 /* Calculate temporary vectorial force */
1259 tx = _mm256_mul_ps(fscal,dx31);
1260 ty = _mm256_mul_ps(fscal,dy31);
1261 tz = _mm256_mul_ps(fscal,dz31);
1263 /* Update vectorial force */
1264 fix3 = _mm256_add_ps(fix3,tx);
1265 fiy3 = _mm256_add_ps(fiy3,ty);
1266 fiz3 = _mm256_add_ps(fiz3,tz);
1268 fjx1 = _mm256_add_ps(fjx1,tx);
1269 fjy1 = _mm256_add_ps(fjy1,ty);
1270 fjz1 = _mm256_add_ps(fjz1,tz);
1272 /**************************
1273 * CALCULATE INTERACTIONS *
1274 **************************/
1276 r32 = _mm256_mul_ps(rsq32,rinv32);
1277 r32 = _mm256_andnot_ps(dummy_mask,r32);
1279 /* Calculate table index by multiplying r with table scale and truncate to integer */
1280 rt = _mm256_mul_ps(r32,vftabscale);
1281 vfitab = _mm256_cvttps_epi32(rt);
1282 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1283 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1284 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1285 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1286 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1287 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1289 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1290 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1291 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1292 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1293 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1294 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1295 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1296 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1297 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1298 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1299 Heps = _mm256_mul_ps(vfeps,H);
1300 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1301 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1302 velec = _mm256_mul_ps(qq32,VV);
1303 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1304 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
1306 /* Update potential sum for this i atom from the interaction with this j atom. */
1307 velec = _mm256_andnot_ps(dummy_mask,velec);
1308 velecsum = _mm256_add_ps(velecsum,velec);
1312 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1314 /* Calculate temporary vectorial force */
1315 tx = _mm256_mul_ps(fscal,dx32);
1316 ty = _mm256_mul_ps(fscal,dy32);
1317 tz = _mm256_mul_ps(fscal,dz32);
1319 /* Update vectorial force */
1320 fix3 = _mm256_add_ps(fix3,tx);
1321 fiy3 = _mm256_add_ps(fiy3,ty);
1322 fiz3 = _mm256_add_ps(fiz3,tz);
1324 fjx2 = _mm256_add_ps(fjx2,tx);
1325 fjy2 = _mm256_add_ps(fjy2,ty);
1326 fjz2 = _mm256_add_ps(fjz2,tz);
1328 /**************************
1329 * CALCULATE INTERACTIONS *
1330 **************************/
1332 r33 = _mm256_mul_ps(rsq33,rinv33);
1333 r33 = _mm256_andnot_ps(dummy_mask,r33);
1335 /* Calculate table index by multiplying r with table scale and truncate to integer */
1336 rt = _mm256_mul_ps(r33,vftabscale);
1337 vfitab = _mm256_cvttps_epi32(rt);
1338 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1339 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1340 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1341 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1342 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1343 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1345 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1346 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1347 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1348 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1349 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1350 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1351 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1352 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1353 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1354 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1355 Heps = _mm256_mul_ps(vfeps,H);
1356 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1357 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1358 velec = _mm256_mul_ps(qq33,VV);
1359 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1360 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
1362 /* Update potential sum for this i atom from the interaction with this j atom. */
1363 velec = _mm256_andnot_ps(dummy_mask,velec);
1364 velecsum = _mm256_add_ps(velecsum,velec);
1368 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1370 /* Calculate temporary vectorial force */
1371 tx = _mm256_mul_ps(fscal,dx33);
1372 ty = _mm256_mul_ps(fscal,dy33);
1373 tz = _mm256_mul_ps(fscal,dz33);
1375 /* Update vectorial force */
1376 fix3 = _mm256_add_ps(fix3,tx);
1377 fiy3 = _mm256_add_ps(fiy3,ty);
1378 fiz3 = _mm256_add_ps(fiz3,tz);
1380 fjx3 = _mm256_add_ps(fjx3,tx);
1381 fjy3 = _mm256_add_ps(fjy3,ty);
1382 fjz3 = _mm256_add_ps(fjz3,tz);
1384 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1385 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1386 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1387 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1388 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1389 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1390 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1391 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1393 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1394 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1395 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1397 /* Inner loop uses 396 flops */
1400 /* End of innermost loop */
1402 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1403 f+i_coord_offset+DIM,fshift+i_shift_offset);
1406 /* Update potential energies */
1407 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1409 /* Increment number of inner iterations */
1410 inneriter += j_index_end - j_index_start;
1412 /* Outer loop uses 19 flops */
1415 /* Increment number of outer iterations */
1418 /* Update outer/inner flops */
1420 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*396);
1423 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_single
1424 * Electrostatics interaction: CubicSplineTable
1425 * VdW interaction: None
1426 * Geometry: Water4-Water4
1427 * Calculate force/pot: Force
1430 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_single
1431 (t_nblist * gmx_restrict nlist,
1432 rvec * gmx_restrict xx,
1433 rvec * gmx_restrict ff,
1434 t_forcerec * gmx_restrict fr,
1435 t_mdatoms * gmx_restrict mdatoms,
1436 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1437 t_nrnb * gmx_restrict nrnb)
1439 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1440 * just 0 for non-waters.
1441 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1442 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1444 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1445 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1446 int jnrA,jnrB,jnrC,jnrD;
1447 int jnrE,jnrF,jnrG,jnrH;
1448 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1449 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1450 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1451 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1452 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1453 real rcutoff_scalar;
1454 real *shiftvec,*fshift,*x,*f;
1455 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1456 real scratch[4*DIM];
1457 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1458 real * vdwioffsetptr1;
1459 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1460 real * vdwioffsetptr2;
1461 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1462 real * vdwioffsetptr3;
1463 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1464 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1465 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1466 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1467 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1468 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1469 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1470 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1471 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1472 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1473 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1474 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1475 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1476 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1477 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1478 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1479 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1482 __m128i vfitab_lo,vfitab_hi;
1483 __m128i ifour = _mm_set1_epi32(4);
1484 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1486 __m256 dummy_mask,cutoff_mask;
1487 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1488 __m256 one = _mm256_set1_ps(1.0);
1489 __m256 two = _mm256_set1_ps(2.0);
1495 jindex = nlist->jindex;
1497 shiftidx = nlist->shift;
1499 shiftvec = fr->shift_vec[0];
1500 fshift = fr->fshift[0];
1501 facel = _mm256_set1_ps(fr->epsfac);
1502 charge = mdatoms->chargeA;
1504 vftab = kernel_data->table_elec->data;
1505 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
1507 /* Setup water-specific parameters */
1508 inr = nlist->iinr[0];
1509 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1510 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1511 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1513 jq1 = _mm256_set1_ps(charge[inr+1]);
1514 jq2 = _mm256_set1_ps(charge[inr+2]);
1515 jq3 = _mm256_set1_ps(charge[inr+3]);
1516 qq11 = _mm256_mul_ps(iq1,jq1);
1517 qq12 = _mm256_mul_ps(iq1,jq2);
1518 qq13 = _mm256_mul_ps(iq1,jq3);
1519 qq21 = _mm256_mul_ps(iq2,jq1);
1520 qq22 = _mm256_mul_ps(iq2,jq2);
1521 qq23 = _mm256_mul_ps(iq2,jq3);
1522 qq31 = _mm256_mul_ps(iq3,jq1);
1523 qq32 = _mm256_mul_ps(iq3,jq2);
1524 qq33 = _mm256_mul_ps(iq3,jq3);
1526 /* Zero-initialize the j indices and offsets to avoid spurious uninitialized-variable compiler warnings */
1527 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1528 j_coord_offsetA = 0;
1529 j_coord_offsetB = 0;
1530 j_coord_offsetC = 0;
1531 j_coord_offsetD = 0;
1532 j_coord_offsetE = 0;
1533 j_coord_offsetF = 0;
1534 j_coord_offsetG = 0;
1535 j_coord_offsetH = 0;
1540 for(iidx=0;iidx<4*DIM;iidx++)
1542 scratch[iidx] = 0.0;
1545 /* Start outer loop over neighborlists */
1546 for(iidx=0; iidx<nri; iidx++)
1548 /* Load shift vector for this list */
1549 i_shift_offset = DIM*shiftidx[iidx];
1551 /* Load limits for loop over neighbors */
1552 j_index_start = jindex[iidx];
1553 j_index_end = jindex[iidx+1];
1555 /* Get outer coordinate index */
1557 i_coord_offset = DIM*inr;
1559 /* Load i particle coords and add shift vector */
1560 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1561 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1563 fix1 = _mm256_setzero_ps();
1564 fiy1 = _mm256_setzero_ps();
1565 fiz1 = _mm256_setzero_ps();
1566 fix2 = _mm256_setzero_ps();
1567 fiy2 = _mm256_setzero_ps();
1568 fiz2 = _mm256_setzero_ps();
1569 fix3 = _mm256_setzero_ps();
1570 fiy3 = _mm256_setzero_ps();
1571 fiz3 = _mm256_setzero_ps();
1573 /* Start inner kernel loop */
1574 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1577 /* Get j neighbor index, and coordinate index */
1579 jnrB = jjnr[jidx+1];
1580 jnrC = jjnr[jidx+2];
1581 jnrD = jjnr[jidx+3];
1582 jnrE = jjnr[jidx+4];
1583 jnrF = jjnr[jidx+5];
1584 jnrG = jjnr[jidx+6];
1585 jnrH = jjnr[jidx+7];
1586 j_coord_offsetA = DIM*jnrA;
1587 j_coord_offsetB = DIM*jnrB;
1588 j_coord_offsetC = DIM*jnrC;
1589 j_coord_offsetD = DIM*jnrD;
1590 j_coord_offsetE = DIM*jnrE;
1591 j_coord_offsetF = DIM*jnrF;
1592 j_coord_offsetG = DIM*jnrG;
1593 j_coord_offsetH = DIM*jnrH;
1595 /* load j atom coordinates */
1596 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1597 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1598 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1599 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1600 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1602 /* Calculate displacement vector */
1603 dx11 = _mm256_sub_ps(ix1,jx1);
1604 dy11 = _mm256_sub_ps(iy1,jy1);
1605 dz11 = _mm256_sub_ps(iz1,jz1);
1606 dx12 = _mm256_sub_ps(ix1,jx2);
1607 dy12 = _mm256_sub_ps(iy1,jy2);
1608 dz12 = _mm256_sub_ps(iz1,jz2);
1609 dx13 = _mm256_sub_ps(ix1,jx3);
1610 dy13 = _mm256_sub_ps(iy1,jy3);
1611 dz13 = _mm256_sub_ps(iz1,jz3);
1612 dx21 = _mm256_sub_ps(ix2,jx1);
1613 dy21 = _mm256_sub_ps(iy2,jy1);
1614 dz21 = _mm256_sub_ps(iz2,jz1);
1615 dx22 = _mm256_sub_ps(ix2,jx2);
1616 dy22 = _mm256_sub_ps(iy2,jy2);
1617 dz22 = _mm256_sub_ps(iz2,jz2);
1618 dx23 = _mm256_sub_ps(ix2,jx3);
1619 dy23 = _mm256_sub_ps(iy2,jy3);
1620 dz23 = _mm256_sub_ps(iz2,jz3);
1621 dx31 = _mm256_sub_ps(ix3,jx1);
1622 dy31 = _mm256_sub_ps(iy3,jy1);
1623 dz31 = _mm256_sub_ps(iz3,jz1);
1624 dx32 = _mm256_sub_ps(ix3,jx2);
1625 dy32 = _mm256_sub_ps(iy3,jy2);
1626 dz32 = _mm256_sub_ps(iz3,jz2);
1627 dx33 = _mm256_sub_ps(ix3,jx3);
1628 dy33 = _mm256_sub_ps(iy3,jy3);
1629 dz33 = _mm256_sub_ps(iz3,jz3);
1631 /* Calculate squared distance and things based on it */
1632 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1633 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1634 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1635 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1636 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1637 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1638 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1639 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1640 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1642 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1643 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1644 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1645 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1646 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1647 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1648 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1649 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1650 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1652 fjx1 = _mm256_setzero_ps();
1653 fjy1 = _mm256_setzero_ps();
1654 fjz1 = _mm256_setzero_ps();
1655 fjx2 = _mm256_setzero_ps();
1656 fjy2 = _mm256_setzero_ps();
1657 fjz2 = _mm256_setzero_ps();
1658 fjx3 = _mm256_setzero_ps();
1659 fjy3 = _mm256_setzero_ps();
1660 fjz3 = _mm256_setzero_ps();
1662 /**************************
1663 * CALCULATE INTERACTIONS *
1664 **************************/
1666 r11 = _mm256_mul_ps(rsq11,rinv11);
1668 /* Calculate table index by multiplying r with table scale and truncate to integer */
1669 rt = _mm256_mul_ps(r11,vftabscale);
1670 vfitab = _mm256_cvttps_epi32(rt);
1671 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1672 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1673 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1674 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1675 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1676 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1678 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1679 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1680 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1681 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1682 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1683 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1684 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1685 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1686 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1687 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1688 Heps = _mm256_mul_ps(vfeps,H);
1689 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1690 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1691 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1695 /* Calculate temporary vectorial force */
1696 tx = _mm256_mul_ps(fscal,dx11);
1697 ty = _mm256_mul_ps(fscal,dy11);
1698 tz = _mm256_mul_ps(fscal,dz11);
1700 /* Update vectorial force */
1701 fix1 = _mm256_add_ps(fix1,tx);
1702 fiy1 = _mm256_add_ps(fiy1,ty);
1703 fiz1 = _mm256_add_ps(fiz1,tz);
1705 fjx1 = _mm256_add_ps(fjx1,tx);
1706 fjy1 = _mm256_add_ps(fjy1,ty);
1707 fjz1 = _mm256_add_ps(fjz1,tz);
1709 /**************************
1710 * CALCULATE INTERACTIONS *
1711 **************************/
1713 r12 = _mm256_mul_ps(rsq12,rinv12);
1715 /* Calculate table index by multiplying r with table scale and truncate to integer */
1716 rt = _mm256_mul_ps(r12,vftabscale);
1717 vfitab = _mm256_cvttps_epi32(rt);
1718 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1719 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1720 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1721 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1722 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1723 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1725 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1726 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1727 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1728 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1729 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1730 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1731 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1732 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1733 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1734 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1735 Heps = _mm256_mul_ps(vfeps,H);
1736 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1737 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1738 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1742 /* Calculate temporary vectorial force */
1743 tx = _mm256_mul_ps(fscal,dx12);
1744 ty = _mm256_mul_ps(fscal,dy12);
1745 tz = _mm256_mul_ps(fscal,dz12);
1747 /* Update vectorial force */
1748 fix1 = _mm256_add_ps(fix1,tx);
1749 fiy1 = _mm256_add_ps(fiy1,ty);
1750 fiz1 = _mm256_add_ps(fiz1,tz);
1752 fjx2 = _mm256_add_ps(fjx2,tx);
1753 fjy2 = _mm256_add_ps(fjy2,ty);
1754 fjz2 = _mm256_add_ps(fjz2,tz);
1756 /**************************
1757 * CALCULATE INTERACTIONS *
1758 **************************/
1760 r13 = _mm256_mul_ps(rsq13,rinv13);
1762 /* Calculate table index by multiplying r with table scale and truncate to integer */
1763 rt = _mm256_mul_ps(r13,vftabscale);
1764 vfitab = _mm256_cvttps_epi32(rt);
1765 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1766 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1767 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1768 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1769 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1770 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1772 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1773 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1774 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1775 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1776 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1777 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1778 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1779 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1780 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1781 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1782 Heps = _mm256_mul_ps(vfeps,H);
1783 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1784 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1785 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
1789 /* Calculate temporary vectorial force */
1790 tx = _mm256_mul_ps(fscal,dx13);
1791 ty = _mm256_mul_ps(fscal,dy13);
1792 tz = _mm256_mul_ps(fscal,dz13);
1794 /* Update vectorial force */
1795 fix1 = _mm256_add_ps(fix1,tx);
1796 fiy1 = _mm256_add_ps(fiy1,ty);
1797 fiz1 = _mm256_add_ps(fiz1,tz);
1799 fjx3 = _mm256_add_ps(fjx3,tx);
1800 fjy3 = _mm256_add_ps(fjy3,ty);
1801 fjz3 = _mm256_add_ps(fjz3,tz);
1803 /**************************
1804 * CALCULATE INTERACTIONS *
1805 **************************/
1807 r21 = _mm256_mul_ps(rsq21,rinv21);
1809 /* Calculate table index by multiplying r with table scale and truncate to integer */
1810 rt = _mm256_mul_ps(r21,vftabscale);
1811 vfitab = _mm256_cvttps_epi32(rt);
1812 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1813 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1814 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1815 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1816 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1817 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1819 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1820 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1821 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1822 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1823 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1824 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1825 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1826 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1827 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1828 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1829 Heps = _mm256_mul_ps(vfeps,H);
1830 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1831 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1832 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1836 /* Calculate temporary vectorial force */
1837 tx = _mm256_mul_ps(fscal,dx21);
1838 ty = _mm256_mul_ps(fscal,dy21);
1839 tz = _mm256_mul_ps(fscal,dz21);
1841 /* Update vectorial force */
1842 fix2 = _mm256_add_ps(fix2,tx);
1843 fiy2 = _mm256_add_ps(fiy2,ty);
1844 fiz2 = _mm256_add_ps(fiz2,tz);
1846 fjx1 = _mm256_add_ps(fjx1,tx);
1847 fjy1 = _mm256_add_ps(fjy1,ty);
1848 fjz1 = _mm256_add_ps(fjz1,tz);
1850 /**************************
1851 * CALCULATE INTERACTIONS *
1852 **************************/
1854 r22 = _mm256_mul_ps(rsq22,rinv22);
1856 /* Calculate table index by multiplying r with table scale and truncate to integer */
1857 rt = _mm256_mul_ps(r22,vftabscale);
1858 vfitab = _mm256_cvttps_epi32(rt);
1859 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1860 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1861 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1862 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1863 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1864 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1866 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1867 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1868 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1869 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1870 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1871 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1872 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1873 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1874 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1875 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1876 Heps = _mm256_mul_ps(vfeps,H);
1877 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1878 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1879 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1883 /* Calculate temporary vectorial force */
1884 tx = _mm256_mul_ps(fscal,dx22);
1885 ty = _mm256_mul_ps(fscal,dy22);
1886 tz = _mm256_mul_ps(fscal,dz22);
1888 /* Update vectorial force */
1889 fix2 = _mm256_add_ps(fix2,tx);
1890 fiy2 = _mm256_add_ps(fiy2,ty);
1891 fiz2 = _mm256_add_ps(fiz2,tz);
1893 fjx2 = _mm256_add_ps(fjx2,tx);
1894 fjy2 = _mm256_add_ps(fjy2,ty);
1895 fjz2 = _mm256_add_ps(fjz2,tz);
1897 /**************************
1898 * CALCULATE INTERACTIONS *
1899 **************************/
1901 r23 = _mm256_mul_ps(rsq23,rinv23);
1903 /* Calculate table index by multiplying r with table scale and truncate to integer */
1904 rt = _mm256_mul_ps(r23,vftabscale);
1905 vfitab = _mm256_cvttps_epi32(rt);
1906 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1907 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1908 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1909 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1910 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1911 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1913 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1914 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1915 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1916 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1917 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1918 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1919 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1920 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1921 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1922 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1923 Heps = _mm256_mul_ps(vfeps,H);
1924 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1925 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1926 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
1930 /* Calculate temporary vectorial force */
1931 tx = _mm256_mul_ps(fscal,dx23);
1932 ty = _mm256_mul_ps(fscal,dy23);
1933 tz = _mm256_mul_ps(fscal,dz23);
1935 /* Update vectorial force */
1936 fix2 = _mm256_add_ps(fix2,tx);
1937 fiy2 = _mm256_add_ps(fiy2,ty);
1938 fiz2 = _mm256_add_ps(fiz2,tz);
1940 fjx3 = _mm256_add_ps(fjx3,tx);
1941 fjy3 = _mm256_add_ps(fjy3,ty);
1942 fjz3 = _mm256_add_ps(fjz3,tz);
1944 /**************************
1945 * CALCULATE INTERACTIONS *
1946 **************************/
1948 r31 = _mm256_mul_ps(rsq31,rinv31);
1950 /* Calculate table index by multiplying r with table scale and truncate to integer */
1951 rt = _mm256_mul_ps(r31,vftabscale);
1952 vfitab = _mm256_cvttps_epi32(rt);
1953 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1954 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1955 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1956 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1957 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1958 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1960 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1961 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1962 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1963 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1964 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1965 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1966 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1967 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1968 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1969 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1970 Heps = _mm256_mul_ps(vfeps,H);
1971 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1972 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1973 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
1977 /* Calculate temporary vectorial force */
1978 tx = _mm256_mul_ps(fscal,dx31);
1979 ty = _mm256_mul_ps(fscal,dy31);
1980 tz = _mm256_mul_ps(fscal,dz31);
1982 /* Update vectorial force */
1983 fix3 = _mm256_add_ps(fix3,tx);
1984 fiy3 = _mm256_add_ps(fiy3,ty);
1985 fiz3 = _mm256_add_ps(fiz3,tz);
1987 fjx1 = _mm256_add_ps(fjx1,tx);
1988 fjy1 = _mm256_add_ps(fjy1,ty);
1989 fjz1 = _mm256_add_ps(fjz1,tz);
1991 /**************************
1992 * CALCULATE INTERACTIONS *
1993 **************************/
1995 r32 = _mm256_mul_ps(rsq32,rinv32);
1997 /* Calculate table index by multiplying r with table scale and truncate to integer */
1998 rt = _mm256_mul_ps(r32,vftabscale);
1999 vfitab = _mm256_cvttps_epi32(rt);
2000 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2001 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2002 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2003 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2004 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2005 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2007 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2008 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2009 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2010 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2011 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2012 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2013 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2014 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2015 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2016 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2017 Heps = _mm256_mul_ps(vfeps,H);
2018 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2019 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2020 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2024 /* Calculate temporary vectorial force */
2025 tx = _mm256_mul_ps(fscal,dx32);
2026 ty = _mm256_mul_ps(fscal,dy32);
2027 tz = _mm256_mul_ps(fscal,dz32);
2029 /* Update vectorial force */
2030 fix3 = _mm256_add_ps(fix3,tx);
2031 fiy3 = _mm256_add_ps(fiy3,ty);
2032 fiz3 = _mm256_add_ps(fiz3,tz);
2034 fjx2 = _mm256_add_ps(fjx2,tx);
2035 fjy2 = _mm256_add_ps(fjy2,ty);
2036 fjz2 = _mm256_add_ps(fjz2,tz);
2038 /**************************
2039 * CALCULATE INTERACTIONS *
2040 **************************/
2042 r33 = _mm256_mul_ps(rsq33,rinv33);
2044 /* Calculate table index by multiplying r with table scale and truncate to integer */
2045 rt = _mm256_mul_ps(r33,vftabscale);
2046 vfitab = _mm256_cvttps_epi32(rt);
2047 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2048 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2049 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2050 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2051 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2052 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2054 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2055 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2056 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2057 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2058 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2059 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2060 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2061 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2062 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2063 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2064 Heps = _mm256_mul_ps(vfeps,H);
2065 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2066 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2067 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2071 /* Calculate temporary vectorial force */
2072 tx = _mm256_mul_ps(fscal,dx33);
2073 ty = _mm256_mul_ps(fscal,dy33);
2074 tz = _mm256_mul_ps(fscal,dz33);
2076 /* Update vectorial force */
2077 fix3 = _mm256_add_ps(fix3,tx);
2078 fiy3 = _mm256_add_ps(fiy3,ty);
2079 fiz3 = _mm256_add_ps(fiz3,tz);
2081 fjx3 = _mm256_add_ps(fjx3,tx);
2082 fjy3 = _mm256_add_ps(fjy3,ty);
2083 fjz3 = _mm256_add_ps(fjz3,tz);
2085 fjptrA = f+j_coord_offsetA;
2086 fjptrB = f+j_coord_offsetB;
2087 fjptrC = f+j_coord_offsetC;
2088 fjptrD = f+j_coord_offsetD;
2089 fjptrE = f+j_coord_offsetE;
2090 fjptrF = f+j_coord_offsetF;
2091 fjptrG = f+j_coord_offsetG;
2092 fjptrH = f+j_coord_offsetH;
2094 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2095 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2096 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2098 /* Inner loop uses 351 flops */
2101 if(jidx<j_index_end)
2104 /* Get j neighbor index, and coordinate index */
2105 jnrlistA = jjnr[jidx];
2106 jnrlistB = jjnr[jidx+1];
2107 jnrlistC = jjnr[jidx+2];
2108 jnrlistD = jjnr[jidx+3];
2109 jnrlistE = jjnr[jidx+4];
2110 jnrlistF = jjnr[jidx+5];
2111 jnrlistG = jjnr[jidx+6];
2112 jnrlistH = jjnr[jidx+7];
2113 /* Sign of each element will be negative for non-real atoms.
2114 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2115 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2117 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2118 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2120 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2121 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2122 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2123 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2124 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2125 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2126 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2127 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2128 j_coord_offsetA = DIM*jnrA;
2129 j_coord_offsetB = DIM*jnrB;
2130 j_coord_offsetC = DIM*jnrC;
2131 j_coord_offsetD = DIM*jnrD;
2132 j_coord_offsetE = DIM*jnrE;
2133 j_coord_offsetF = DIM*jnrF;
2134 j_coord_offsetG = DIM*jnrG;
2135 j_coord_offsetH = DIM*jnrH;
2137 /* load j atom coordinates */
2138 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2139 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2140 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
2141 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
2142 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2144 /* Calculate displacement vector */
2145 dx11 = _mm256_sub_ps(ix1,jx1);
2146 dy11 = _mm256_sub_ps(iy1,jy1);
2147 dz11 = _mm256_sub_ps(iz1,jz1);
2148 dx12 = _mm256_sub_ps(ix1,jx2);
2149 dy12 = _mm256_sub_ps(iy1,jy2);
2150 dz12 = _mm256_sub_ps(iz1,jz2);
2151 dx13 = _mm256_sub_ps(ix1,jx3);
2152 dy13 = _mm256_sub_ps(iy1,jy3);
2153 dz13 = _mm256_sub_ps(iz1,jz3);
2154 dx21 = _mm256_sub_ps(ix2,jx1);
2155 dy21 = _mm256_sub_ps(iy2,jy1);
2156 dz21 = _mm256_sub_ps(iz2,jz1);
2157 dx22 = _mm256_sub_ps(ix2,jx2);
2158 dy22 = _mm256_sub_ps(iy2,jy2);
2159 dz22 = _mm256_sub_ps(iz2,jz2);
2160 dx23 = _mm256_sub_ps(ix2,jx3);
2161 dy23 = _mm256_sub_ps(iy2,jy3);
2162 dz23 = _mm256_sub_ps(iz2,jz3);
2163 dx31 = _mm256_sub_ps(ix3,jx1);
2164 dy31 = _mm256_sub_ps(iy3,jy1);
2165 dz31 = _mm256_sub_ps(iz3,jz1);
2166 dx32 = _mm256_sub_ps(ix3,jx2);
2167 dy32 = _mm256_sub_ps(iy3,jy2);
2168 dz32 = _mm256_sub_ps(iz3,jz2);
2169 dx33 = _mm256_sub_ps(ix3,jx3);
2170 dy33 = _mm256_sub_ps(iy3,jy3);
2171 dz33 = _mm256_sub_ps(iz3,jz3);
2173 /* Calculate squared distance and things based on it */
2174 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2175 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2176 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2177 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2178 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2179 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2180 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2181 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2182 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2184 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2185 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2186 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2187 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2188 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2189 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2190 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2191 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2192 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2194 fjx1 = _mm256_setzero_ps();
2195 fjy1 = _mm256_setzero_ps();
2196 fjz1 = _mm256_setzero_ps();
2197 fjx2 = _mm256_setzero_ps();
2198 fjy2 = _mm256_setzero_ps();
2199 fjz2 = _mm256_setzero_ps();
2200 fjx3 = _mm256_setzero_ps();
2201 fjy3 = _mm256_setzero_ps();
2202 fjz3 = _mm256_setzero_ps();
2204 /**************************
2205 * CALCULATE INTERACTIONS *
2206 **************************/
2208 r11 = _mm256_mul_ps(rsq11,rinv11);
2209 r11 = _mm256_andnot_ps(dummy_mask,r11);
2211 /* Calculate table index by multiplying r with table scale and truncate to integer */
2212 rt = _mm256_mul_ps(r11,vftabscale);
2213 vfitab = _mm256_cvttps_epi32(rt);
2214 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2215 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2216 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2217 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2218 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2219 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2221 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2222 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2223 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2224 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2225 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2226 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2227 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2228 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2229 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2230 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2231 Heps = _mm256_mul_ps(vfeps,H);
2232 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2233 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2234 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2238 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2240 /* Calculate temporary vectorial force */
2241 tx = _mm256_mul_ps(fscal,dx11);
2242 ty = _mm256_mul_ps(fscal,dy11);
2243 tz = _mm256_mul_ps(fscal,dz11);
2245 /* Update vectorial force */
2246 fix1 = _mm256_add_ps(fix1,tx);
2247 fiy1 = _mm256_add_ps(fiy1,ty);
2248 fiz1 = _mm256_add_ps(fiz1,tz);
2250 fjx1 = _mm256_add_ps(fjx1,tx);
2251 fjy1 = _mm256_add_ps(fjy1,ty);
2252 fjz1 = _mm256_add_ps(fjz1,tz);
2254 /**************************
2255 * CALCULATE INTERACTIONS *
2256 **************************/
2258 r12 = _mm256_mul_ps(rsq12,rinv12);
2259 r12 = _mm256_andnot_ps(dummy_mask,r12);
2261 /* Calculate table index by multiplying r with table scale and truncate to integer */
2262 rt = _mm256_mul_ps(r12,vftabscale);
2263 vfitab = _mm256_cvttps_epi32(rt);
2264 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2265 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2266 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2267 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2268 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2269 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2271 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2272 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2273 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2274 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2275 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2276 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2277 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2278 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2279 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2280 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2281 Heps = _mm256_mul_ps(vfeps,H);
2282 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2283 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2284 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2288 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2290 /* Calculate temporary vectorial force */
2291 tx = _mm256_mul_ps(fscal,dx12);
2292 ty = _mm256_mul_ps(fscal,dy12);
2293 tz = _mm256_mul_ps(fscal,dz12);
2295 /* Update vectorial force */
2296 fix1 = _mm256_add_ps(fix1,tx);
2297 fiy1 = _mm256_add_ps(fiy1,ty);
2298 fiz1 = _mm256_add_ps(fiz1,tz);
2300 fjx2 = _mm256_add_ps(fjx2,tx);
2301 fjy2 = _mm256_add_ps(fjy2,ty);
2302 fjz2 = _mm256_add_ps(fjz2,tz);
2304 /**************************
2305 * CALCULATE INTERACTIONS *
2306 **************************/
2308 r13 = _mm256_mul_ps(rsq13,rinv13);
2309 r13 = _mm256_andnot_ps(dummy_mask,r13);
2311 /* Calculate table index by multiplying r with table scale and truncate to integer */
2312 rt = _mm256_mul_ps(r13,vftabscale);
2313 vfitab = _mm256_cvttps_epi32(rt);
2314 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2315 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2316 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2317 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2318 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2319 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2321 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2322 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2323 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2324 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2325 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2326 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2327 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2328 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2329 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2330 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2331 Heps = _mm256_mul_ps(vfeps,H);
2332 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2333 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2334 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
2338 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2340 /* Calculate temporary vectorial force */
2341 tx = _mm256_mul_ps(fscal,dx13);
2342 ty = _mm256_mul_ps(fscal,dy13);
2343 tz = _mm256_mul_ps(fscal,dz13);
2345 /* Update vectorial force */
2346 fix1 = _mm256_add_ps(fix1,tx);
2347 fiy1 = _mm256_add_ps(fiy1,ty);
2348 fiz1 = _mm256_add_ps(fiz1,tz);
2350 fjx3 = _mm256_add_ps(fjx3,tx);
2351 fjy3 = _mm256_add_ps(fjy3,ty);
2352 fjz3 = _mm256_add_ps(fjz3,tz);
2354 /**************************
2355 * CALCULATE INTERACTIONS *
2356 **************************/
2358 r21 = _mm256_mul_ps(rsq21,rinv21);
2359 r21 = _mm256_andnot_ps(dummy_mask,r21);
2361 /* Calculate table index by multiplying r with table scale and truncate to integer */
2362 rt = _mm256_mul_ps(r21,vftabscale);
2363 vfitab = _mm256_cvttps_epi32(rt);
2364 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2365 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2366 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2367 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2368 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2369 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2371 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2372 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2373 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2374 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2375 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2376 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2377 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2378 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2379 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2380 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2381 Heps = _mm256_mul_ps(vfeps,H);
2382 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2383 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2384 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2388 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2390 /* Calculate temporary vectorial force */
2391 tx = _mm256_mul_ps(fscal,dx21);
2392 ty = _mm256_mul_ps(fscal,dy21);
2393 tz = _mm256_mul_ps(fscal,dz21);
2395 /* Update vectorial force */
2396 fix2 = _mm256_add_ps(fix2,tx);
2397 fiy2 = _mm256_add_ps(fiy2,ty);
2398 fiz2 = _mm256_add_ps(fiz2,tz);
2400 fjx1 = _mm256_add_ps(fjx1,tx);
2401 fjy1 = _mm256_add_ps(fjy1,ty);
2402 fjz1 = _mm256_add_ps(fjz1,tz);
2404 /**************************
2405 * CALCULATE INTERACTIONS *
2406 **************************/
2408 r22 = _mm256_mul_ps(rsq22,rinv22);
2409 r22 = _mm256_andnot_ps(dummy_mask,r22);
2411 /* Calculate table index by multiplying r with table scale and truncate to integer */
2412 rt = _mm256_mul_ps(r22,vftabscale);
2413 vfitab = _mm256_cvttps_epi32(rt);
2414 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2415 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2416 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2417 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2418 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2419 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2421 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2422 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2423 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2424 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2425 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2426 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2427 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2428 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2429 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2430 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2431 Heps = _mm256_mul_ps(vfeps,H);
2432 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2433 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2434 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2438 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2440 /* Calculate temporary vectorial force */
2441 tx = _mm256_mul_ps(fscal,dx22);
2442 ty = _mm256_mul_ps(fscal,dy22);
2443 tz = _mm256_mul_ps(fscal,dz22);
2445 /* Update vectorial force */
2446 fix2 = _mm256_add_ps(fix2,tx);
2447 fiy2 = _mm256_add_ps(fiy2,ty);
2448 fiz2 = _mm256_add_ps(fiz2,tz);
2450 fjx2 = _mm256_add_ps(fjx2,tx);
2451 fjy2 = _mm256_add_ps(fjy2,ty);
2452 fjz2 = _mm256_add_ps(fjz2,tz);
2454 /**************************
2455 * CALCULATE INTERACTIONS *
2456 **************************/
2458 r23 = _mm256_mul_ps(rsq23,rinv23);
2459 r23 = _mm256_andnot_ps(dummy_mask,r23);
2461 /* Calculate table index by multiplying r with table scale and truncate to integer */
2462 rt = _mm256_mul_ps(r23,vftabscale);
2463 vfitab = _mm256_cvttps_epi32(rt);
2464 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2465 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2466 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2467 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2468 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2469 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2471 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2472 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2473 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2474 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2475 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2476 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2477 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2478 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2479 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2480 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2481 Heps = _mm256_mul_ps(vfeps,H);
2482 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2483 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2484 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2488 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2490 /* Calculate temporary vectorial force */
2491 tx = _mm256_mul_ps(fscal,dx23);
2492 ty = _mm256_mul_ps(fscal,dy23);
2493 tz = _mm256_mul_ps(fscal,dz23);
2495 /* Update vectorial force */
2496 fix2 = _mm256_add_ps(fix2,tx);
2497 fiy2 = _mm256_add_ps(fiy2,ty);
2498 fiz2 = _mm256_add_ps(fiz2,tz);
2500 fjx3 = _mm256_add_ps(fjx3,tx);
2501 fjy3 = _mm256_add_ps(fjy3,ty);
2502 fjz3 = _mm256_add_ps(fjz3,tz);
2504 /**************************
2505 * CALCULATE INTERACTIONS *
2506 **************************/
2508 r31 = _mm256_mul_ps(rsq31,rinv31);
2509 r31 = _mm256_andnot_ps(dummy_mask,r31);
2511 /* Calculate table index by multiplying r with table scale and truncate to integer */
2512 rt = _mm256_mul_ps(r31,vftabscale);
2513 vfitab = _mm256_cvttps_epi32(rt);
2514 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2515 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2516 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2517 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2518 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2519 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2521 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2522 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2523 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2524 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2525 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2526 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2527 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2528 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2529 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2530 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2531 Heps = _mm256_mul_ps(vfeps,H);
2532 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2533 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2534 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2538 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2540 /* Calculate temporary vectorial force */
2541 tx = _mm256_mul_ps(fscal,dx31);
2542 ty = _mm256_mul_ps(fscal,dy31);
2543 tz = _mm256_mul_ps(fscal,dz31);
2545 /* Update vectorial force */
2546 fix3 = _mm256_add_ps(fix3,tx);
2547 fiy3 = _mm256_add_ps(fiy3,ty);
2548 fiz3 = _mm256_add_ps(fiz3,tz);
2550 fjx1 = _mm256_add_ps(fjx1,tx);
2551 fjy1 = _mm256_add_ps(fjy1,ty);
2552 fjz1 = _mm256_add_ps(fjz1,tz);
2554 /**************************
2555 * CALCULATE INTERACTIONS *
2556 **************************/
2558 r32 = _mm256_mul_ps(rsq32,rinv32);
2559 r32 = _mm256_andnot_ps(dummy_mask,r32);
2561 /* Calculate table index by multiplying r with table scale and truncate to integer */
2562 rt = _mm256_mul_ps(r32,vftabscale);
2563 vfitab = _mm256_cvttps_epi32(rt);
2564 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2565 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2566 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2567 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2568 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2569 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2571 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2572 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2573 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2574 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2575 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2576 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2577 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2578 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2579 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2580 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2581 Heps = _mm256_mul_ps(vfeps,H);
2582 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2583 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2584 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2588 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2590 /* Calculate temporary vectorial force */
2591 tx = _mm256_mul_ps(fscal,dx32);
2592 ty = _mm256_mul_ps(fscal,dy32);
2593 tz = _mm256_mul_ps(fscal,dz32);
2595 /* Update vectorial force */
2596 fix3 = _mm256_add_ps(fix3,tx);
2597 fiy3 = _mm256_add_ps(fiy3,ty);
2598 fiz3 = _mm256_add_ps(fiz3,tz);
2600 fjx2 = _mm256_add_ps(fjx2,tx);
2601 fjy2 = _mm256_add_ps(fjy2,ty);
2602 fjz2 = _mm256_add_ps(fjz2,tz);
2604 /**************************
2605 * CALCULATE INTERACTIONS *
2606 **************************/
2608 r33 = _mm256_mul_ps(rsq33,rinv33);
2609 r33 = _mm256_andnot_ps(dummy_mask,r33);
2611 /* Calculate table index by multiplying r with table scale and truncate to integer */
2612 rt = _mm256_mul_ps(r33,vftabscale);
2613 vfitab = _mm256_cvttps_epi32(rt);
2614 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2615 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2616 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2617 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2618 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2619 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2621 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2622 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2623 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2624 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2625 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2626 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2627 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2628 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2629 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2630 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2631 Heps = _mm256_mul_ps(vfeps,H);
2632 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2633 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2634 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2638 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2640 /* Calculate temporary vectorial force */
2641 tx = _mm256_mul_ps(fscal,dx33);
2642 ty = _mm256_mul_ps(fscal,dy33);
2643 tz = _mm256_mul_ps(fscal,dz33);
2645 /* Update vectorial force */
2646 fix3 = _mm256_add_ps(fix3,tx);
2647 fiy3 = _mm256_add_ps(fiy3,ty);
2648 fiz3 = _mm256_add_ps(fiz3,tz);
2650 fjx3 = _mm256_add_ps(fjx3,tx);
2651 fjy3 = _mm256_add_ps(fjy3,ty);
2652 fjz3 = _mm256_add_ps(fjz3,tz);
2654 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2655 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2656 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2657 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2658 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2659 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2660 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2661 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2663 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2664 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2665 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2667 /* Inner loop uses 360 flops */
2670 /* End of innermost loop */
2672 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2673 f+i_coord_offset+DIM,fshift+i_shift_offset);
2675 /* Increment number of inner iterations */
2676 inneriter += j_index_end - j_index_start;
2678 /* Outer loop uses 18 flops */
2681 /* Increment number of outer iterations */
2684 /* Update outer/inner flops */
2686 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*360);