2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "types/simple.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_single
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: None
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the eight positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr1;
89 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 real * vdwioffsetptr3;
93 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
99 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
100 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
112 __m128i vfitab_lo,vfitab_hi;
113 __m128i ifour = _mm_set1_epi32(4);
114 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
116 __m256 dummy_mask,cutoff_mask;
117 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
118 __m256 one = _mm256_set1_ps(1.0);
119 __m256 two = _mm256_set1_ps(2.0);
125 jindex = nlist->jindex;
127 shiftidx = nlist->shift;
129 shiftvec = fr->shift_vec[0];
130 fshift = fr->fshift[0];
131 facel = _mm256_set1_ps(fr->epsfac);
132 charge = mdatoms->chargeA;
134 vftab = kernel_data->table_elec->data;
135 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
137 /* Setup water-specific parameters */
138 inr = nlist->iinr[0];
139 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
140 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
141 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
143 jq1 = _mm256_set1_ps(charge[inr+1]);
144 jq2 = _mm256_set1_ps(charge[inr+2]);
145 jq3 = _mm256_set1_ps(charge[inr+3]);
146 qq11 = _mm256_mul_ps(iq1,jq1);
147 qq12 = _mm256_mul_ps(iq1,jq2);
148 qq13 = _mm256_mul_ps(iq1,jq3);
149 qq21 = _mm256_mul_ps(iq2,jq1);
150 qq22 = _mm256_mul_ps(iq2,jq2);
151 qq23 = _mm256_mul_ps(iq2,jq3);
152 qq31 = _mm256_mul_ps(iq3,jq1);
153 qq32 = _mm256_mul_ps(iq3,jq2);
154 qq33 = _mm256_mul_ps(iq3,jq3);
156 /* Avoid stupid compiler warnings */
157 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
170 for(iidx=0;iidx<4*DIM;iidx++)
175 /* Start outer loop over neighborlists */
176 for(iidx=0; iidx<nri; iidx++)
178 /* Load shift vector for this list */
179 i_shift_offset = DIM*shiftidx[iidx];
181 /* Load limits for loop over neighbors */
182 j_index_start = jindex[iidx];
183 j_index_end = jindex[iidx+1];
185 /* Get outer coordinate index */
187 i_coord_offset = DIM*inr;
189 /* Load i particle coords and add shift vector */
190 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
191 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
193 fix1 = _mm256_setzero_ps();
194 fiy1 = _mm256_setzero_ps();
195 fiz1 = _mm256_setzero_ps();
196 fix2 = _mm256_setzero_ps();
197 fiy2 = _mm256_setzero_ps();
198 fiz2 = _mm256_setzero_ps();
199 fix3 = _mm256_setzero_ps();
200 fiy3 = _mm256_setzero_ps();
201 fiz3 = _mm256_setzero_ps();
203 /* Reset potential sums */
204 velecsum = _mm256_setzero_ps();
206 /* Start inner kernel loop */
207 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
210 /* Get j neighbor index, and coordinate index */
219 j_coord_offsetA = DIM*jnrA;
220 j_coord_offsetB = DIM*jnrB;
221 j_coord_offsetC = DIM*jnrC;
222 j_coord_offsetD = DIM*jnrD;
223 j_coord_offsetE = DIM*jnrE;
224 j_coord_offsetF = DIM*jnrF;
225 j_coord_offsetG = DIM*jnrG;
226 j_coord_offsetH = DIM*jnrH;
228 /* load j atom coordinates */
229 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
230 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
231 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
232 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
233 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
235 /* Calculate displacement vector */
236 dx11 = _mm256_sub_ps(ix1,jx1);
237 dy11 = _mm256_sub_ps(iy1,jy1);
238 dz11 = _mm256_sub_ps(iz1,jz1);
239 dx12 = _mm256_sub_ps(ix1,jx2);
240 dy12 = _mm256_sub_ps(iy1,jy2);
241 dz12 = _mm256_sub_ps(iz1,jz2);
242 dx13 = _mm256_sub_ps(ix1,jx3);
243 dy13 = _mm256_sub_ps(iy1,jy3);
244 dz13 = _mm256_sub_ps(iz1,jz3);
245 dx21 = _mm256_sub_ps(ix2,jx1);
246 dy21 = _mm256_sub_ps(iy2,jy1);
247 dz21 = _mm256_sub_ps(iz2,jz1);
248 dx22 = _mm256_sub_ps(ix2,jx2);
249 dy22 = _mm256_sub_ps(iy2,jy2);
250 dz22 = _mm256_sub_ps(iz2,jz2);
251 dx23 = _mm256_sub_ps(ix2,jx3);
252 dy23 = _mm256_sub_ps(iy2,jy3);
253 dz23 = _mm256_sub_ps(iz2,jz3);
254 dx31 = _mm256_sub_ps(ix3,jx1);
255 dy31 = _mm256_sub_ps(iy3,jy1);
256 dz31 = _mm256_sub_ps(iz3,jz1);
257 dx32 = _mm256_sub_ps(ix3,jx2);
258 dy32 = _mm256_sub_ps(iy3,jy2);
259 dz32 = _mm256_sub_ps(iz3,jz2);
260 dx33 = _mm256_sub_ps(ix3,jx3);
261 dy33 = _mm256_sub_ps(iy3,jy3);
262 dz33 = _mm256_sub_ps(iz3,jz3);
264 /* Calculate squared distance and things based on it */
265 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
266 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
267 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
268 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
269 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
270 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
271 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
272 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
273 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
275 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
276 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
277 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
278 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
279 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
280 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
281 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
282 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
283 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
285 fjx1 = _mm256_setzero_ps();
286 fjy1 = _mm256_setzero_ps();
287 fjz1 = _mm256_setzero_ps();
288 fjx2 = _mm256_setzero_ps();
289 fjy2 = _mm256_setzero_ps();
290 fjz2 = _mm256_setzero_ps();
291 fjx3 = _mm256_setzero_ps();
292 fjy3 = _mm256_setzero_ps();
293 fjz3 = _mm256_setzero_ps();
295 /**************************
296 * CALCULATE INTERACTIONS *
297 **************************/
299 r11 = _mm256_mul_ps(rsq11,rinv11);
301 /* Calculate table index by multiplying r with table scale and truncate to integer */
302 rt = _mm256_mul_ps(r11,vftabscale);
303 vfitab = _mm256_cvttps_epi32(rt);
304 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
305 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
306 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
307 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
308 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
309 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
311 /* CUBIC SPLINE TABLE ELECTROSTATICS */
312 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
313 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
314 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
315 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
316 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
317 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
318 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
319 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
320 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
321 Heps = _mm256_mul_ps(vfeps,H);
322 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
323 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
324 velec = _mm256_mul_ps(qq11,VV);
325 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
326 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
328 /* Update potential sum for this i atom from the interaction with this j atom. */
329 velecsum = _mm256_add_ps(velecsum,velec);
333 /* Calculate temporary vectorial force */
334 tx = _mm256_mul_ps(fscal,dx11);
335 ty = _mm256_mul_ps(fscal,dy11);
336 tz = _mm256_mul_ps(fscal,dz11);
338 /* Update vectorial force */
339 fix1 = _mm256_add_ps(fix1,tx);
340 fiy1 = _mm256_add_ps(fiy1,ty);
341 fiz1 = _mm256_add_ps(fiz1,tz);
343 fjx1 = _mm256_add_ps(fjx1,tx);
344 fjy1 = _mm256_add_ps(fjy1,ty);
345 fjz1 = _mm256_add_ps(fjz1,tz);
347 /**************************
348 * CALCULATE INTERACTIONS *
349 **************************/
351 r12 = _mm256_mul_ps(rsq12,rinv12);
353 /* Calculate table index by multiplying r with table scale and truncate to integer */
354 rt = _mm256_mul_ps(r12,vftabscale);
355 vfitab = _mm256_cvttps_epi32(rt);
356 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
357 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
358 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
359 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
360 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
361 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
363 /* CUBIC SPLINE TABLE ELECTROSTATICS */
364 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
365 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
366 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
367 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
368 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
369 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
370 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
371 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
372 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
373 Heps = _mm256_mul_ps(vfeps,H);
374 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
375 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
376 velec = _mm256_mul_ps(qq12,VV);
377 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
378 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
380 /* Update potential sum for this i atom from the interaction with this j atom. */
381 velecsum = _mm256_add_ps(velecsum,velec);
385 /* Calculate temporary vectorial force */
386 tx = _mm256_mul_ps(fscal,dx12);
387 ty = _mm256_mul_ps(fscal,dy12);
388 tz = _mm256_mul_ps(fscal,dz12);
390 /* Update vectorial force */
391 fix1 = _mm256_add_ps(fix1,tx);
392 fiy1 = _mm256_add_ps(fiy1,ty);
393 fiz1 = _mm256_add_ps(fiz1,tz);
395 fjx2 = _mm256_add_ps(fjx2,tx);
396 fjy2 = _mm256_add_ps(fjy2,ty);
397 fjz2 = _mm256_add_ps(fjz2,tz);
399 /**************************
400 * CALCULATE INTERACTIONS *
401 **************************/
403 r13 = _mm256_mul_ps(rsq13,rinv13);
405 /* Calculate table index by multiplying r with table scale and truncate to integer */
406 rt = _mm256_mul_ps(r13,vftabscale);
407 vfitab = _mm256_cvttps_epi32(rt);
408 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
409 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
410 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
411 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
412 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
413 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
415 /* CUBIC SPLINE TABLE ELECTROSTATICS */
416 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
417 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
418 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
419 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
420 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
421 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
422 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
423 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
424 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
425 Heps = _mm256_mul_ps(vfeps,H);
426 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
427 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
428 velec = _mm256_mul_ps(qq13,VV);
429 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
430 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velecsum = _mm256_add_ps(velecsum,velec);
437 /* Calculate temporary vectorial force */
438 tx = _mm256_mul_ps(fscal,dx13);
439 ty = _mm256_mul_ps(fscal,dy13);
440 tz = _mm256_mul_ps(fscal,dz13);
442 /* Update vectorial force */
443 fix1 = _mm256_add_ps(fix1,tx);
444 fiy1 = _mm256_add_ps(fiy1,ty);
445 fiz1 = _mm256_add_ps(fiz1,tz);
447 fjx3 = _mm256_add_ps(fjx3,tx);
448 fjy3 = _mm256_add_ps(fjy3,ty);
449 fjz3 = _mm256_add_ps(fjz3,tz);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 r21 = _mm256_mul_ps(rsq21,rinv21);
457 /* Calculate table index by multiplying r with table scale and truncate to integer */
458 rt = _mm256_mul_ps(r21,vftabscale);
459 vfitab = _mm256_cvttps_epi32(rt);
460 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
461 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
462 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
463 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
464 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
465 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
467 /* CUBIC SPLINE TABLE ELECTROSTATICS */
468 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
469 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
470 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
471 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
472 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
473 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
474 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
475 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
476 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
477 Heps = _mm256_mul_ps(vfeps,H);
478 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
479 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
480 velec = _mm256_mul_ps(qq21,VV);
481 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
482 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
484 /* Update potential sum for this i atom from the interaction with this j atom. */
485 velecsum = _mm256_add_ps(velecsum,velec);
489 /* Calculate temporary vectorial force */
490 tx = _mm256_mul_ps(fscal,dx21);
491 ty = _mm256_mul_ps(fscal,dy21);
492 tz = _mm256_mul_ps(fscal,dz21);
494 /* Update vectorial force */
495 fix2 = _mm256_add_ps(fix2,tx);
496 fiy2 = _mm256_add_ps(fiy2,ty);
497 fiz2 = _mm256_add_ps(fiz2,tz);
499 fjx1 = _mm256_add_ps(fjx1,tx);
500 fjy1 = _mm256_add_ps(fjy1,ty);
501 fjz1 = _mm256_add_ps(fjz1,tz);
503 /**************************
504 * CALCULATE INTERACTIONS *
505 **************************/
507 r22 = _mm256_mul_ps(rsq22,rinv22);
509 /* Calculate table index by multiplying r with table scale and truncate to integer */
510 rt = _mm256_mul_ps(r22,vftabscale);
511 vfitab = _mm256_cvttps_epi32(rt);
512 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
513 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
514 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
515 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
516 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
517 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
519 /* CUBIC SPLINE TABLE ELECTROSTATICS */
520 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
521 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
522 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
523 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
524 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
525 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
526 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
527 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
528 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
529 Heps = _mm256_mul_ps(vfeps,H);
530 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
531 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
532 velec = _mm256_mul_ps(qq22,VV);
533 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
534 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
536 /* Update potential sum for this i atom from the interaction with this j atom. */
537 velecsum = _mm256_add_ps(velecsum,velec);
541 /* Calculate temporary vectorial force */
542 tx = _mm256_mul_ps(fscal,dx22);
543 ty = _mm256_mul_ps(fscal,dy22);
544 tz = _mm256_mul_ps(fscal,dz22);
546 /* Update vectorial force */
547 fix2 = _mm256_add_ps(fix2,tx);
548 fiy2 = _mm256_add_ps(fiy2,ty);
549 fiz2 = _mm256_add_ps(fiz2,tz);
551 fjx2 = _mm256_add_ps(fjx2,tx);
552 fjy2 = _mm256_add_ps(fjy2,ty);
553 fjz2 = _mm256_add_ps(fjz2,tz);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 r23 = _mm256_mul_ps(rsq23,rinv23);
561 /* Calculate table index by multiplying r with table scale and truncate to integer */
562 rt = _mm256_mul_ps(r23,vftabscale);
563 vfitab = _mm256_cvttps_epi32(rt);
564 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
565 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
566 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
567 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
568 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
569 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
571 /* CUBIC SPLINE TABLE ELECTROSTATICS */
572 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
573 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
574 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
575 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
576 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
577 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
578 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
579 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
580 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
581 Heps = _mm256_mul_ps(vfeps,H);
582 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
583 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
584 velec = _mm256_mul_ps(qq23,VV);
585 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
586 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
588 /* Update potential sum for this i atom from the interaction with this j atom. */
589 velecsum = _mm256_add_ps(velecsum,velec);
593 /* Calculate temporary vectorial force */
594 tx = _mm256_mul_ps(fscal,dx23);
595 ty = _mm256_mul_ps(fscal,dy23);
596 tz = _mm256_mul_ps(fscal,dz23);
598 /* Update vectorial force */
599 fix2 = _mm256_add_ps(fix2,tx);
600 fiy2 = _mm256_add_ps(fiy2,ty);
601 fiz2 = _mm256_add_ps(fiz2,tz);
603 fjx3 = _mm256_add_ps(fjx3,tx);
604 fjy3 = _mm256_add_ps(fjy3,ty);
605 fjz3 = _mm256_add_ps(fjz3,tz);
607 /**************************
608 * CALCULATE INTERACTIONS *
609 **************************/
611 r31 = _mm256_mul_ps(rsq31,rinv31);
613 /* Calculate table index by multiplying r with table scale and truncate to integer */
614 rt = _mm256_mul_ps(r31,vftabscale);
615 vfitab = _mm256_cvttps_epi32(rt);
616 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
617 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
618 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
619 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
620 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
621 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
623 /* CUBIC SPLINE TABLE ELECTROSTATICS */
624 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
625 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
626 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
627 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
628 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
629 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
630 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
631 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
632 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
633 Heps = _mm256_mul_ps(vfeps,H);
634 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
635 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
636 velec = _mm256_mul_ps(qq31,VV);
637 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
638 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
640 /* Update potential sum for this i atom from the interaction with this j atom. */
641 velecsum = _mm256_add_ps(velecsum,velec);
645 /* Calculate temporary vectorial force */
646 tx = _mm256_mul_ps(fscal,dx31);
647 ty = _mm256_mul_ps(fscal,dy31);
648 tz = _mm256_mul_ps(fscal,dz31);
650 /* Update vectorial force */
651 fix3 = _mm256_add_ps(fix3,tx);
652 fiy3 = _mm256_add_ps(fiy3,ty);
653 fiz3 = _mm256_add_ps(fiz3,tz);
655 fjx1 = _mm256_add_ps(fjx1,tx);
656 fjy1 = _mm256_add_ps(fjy1,ty);
657 fjz1 = _mm256_add_ps(fjz1,tz);
659 /**************************
660 * CALCULATE INTERACTIONS *
661 **************************/
663 r32 = _mm256_mul_ps(rsq32,rinv32);
665 /* Calculate table index by multiplying r with table scale and truncate to integer */
666 rt = _mm256_mul_ps(r32,vftabscale);
667 vfitab = _mm256_cvttps_epi32(rt);
668 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
669 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
670 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
671 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
672 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
673 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
675 /* CUBIC SPLINE TABLE ELECTROSTATICS */
676 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
677 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
678 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
679 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
680 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
681 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
682 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
683 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
684 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
685 Heps = _mm256_mul_ps(vfeps,H);
686 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
687 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
688 velec = _mm256_mul_ps(qq32,VV);
689 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
690 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
692 /* Update potential sum for this i atom from the interaction with this j atom. */
693 velecsum = _mm256_add_ps(velecsum,velec);
697 /* Calculate temporary vectorial force */
698 tx = _mm256_mul_ps(fscal,dx32);
699 ty = _mm256_mul_ps(fscal,dy32);
700 tz = _mm256_mul_ps(fscal,dz32);
702 /* Update vectorial force */
703 fix3 = _mm256_add_ps(fix3,tx);
704 fiy3 = _mm256_add_ps(fiy3,ty);
705 fiz3 = _mm256_add_ps(fiz3,tz);
707 fjx2 = _mm256_add_ps(fjx2,tx);
708 fjy2 = _mm256_add_ps(fjy2,ty);
709 fjz2 = _mm256_add_ps(fjz2,tz);
711 /**************************
712 * CALCULATE INTERACTIONS *
713 **************************/
715 r33 = _mm256_mul_ps(rsq33,rinv33);
717 /* Calculate table index by multiplying r with table scale and truncate to integer */
718 rt = _mm256_mul_ps(r33,vftabscale);
719 vfitab = _mm256_cvttps_epi32(rt);
720 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
721 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
722 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
723 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
724 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
725 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
727 /* CUBIC SPLINE TABLE ELECTROSTATICS */
728 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
729 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
730 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
731 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
732 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
733 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
734 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
735 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
736 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
737 Heps = _mm256_mul_ps(vfeps,H);
738 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
739 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
740 velec = _mm256_mul_ps(qq33,VV);
741 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
742 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
744 /* Update potential sum for this i atom from the interaction with this j atom. */
745 velecsum = _mm256_add_ps(velecsum,velec);
749 /* Calculate temporary vectorial force */
750 tx = _mm256_mul_ps(fscal,dx33);
751 ty = _mm256_mul_ps(fscal,dy33);
752 tz = _mm256_mul_ps(fscal,dz33);
754 /* Update vectorial force */
755 fix3 = _mm256_add_ps(fix3,tx);
756 fiy3 = _mm256_add_ps(fiy3,ty);
757 fiz3 = _mm256_add_ps(fiz3,tz);
759 fjx3 = _mm256_add_ps(fjx3,tx);
760 fjy3 = _mm256_add_ps(fjy3,ty);
761 fjz3 = _mm256_add_ps(fjz3,tz);
763 fjptrA = f+j_coord_offsetA;
764 fjptrB = f+j_coord_offsetB;
765 fjptrC = f+j_coord_offsetC;
766 fjptrD = f+j_coord_offsetD;
767 fjptrE = f+j_coord_offsetE;
768 fjptrF = f+j_coord_offsetF;
769 fjptrG = f+j_coord_offsetG;
770 fjptrH = f+j_coord_offsetH;
772 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
773 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
774 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
776 /* Inner loop uses 387 flops */
782 /* Get j neighbor index, and coordinate index */
783 jnrlistA = jjnr[jidx];
784 jnrlistB = jjnr[jidx+1];
785 jnrlistC = jjnr[jidx+2];
786 jnrlistD = jjnr[jidx+3];
787 jnrlistE = jjnr[jidx+4];
788 jnrlistF = jjnr[jidx+5];
789 jnrlistG = jjnr[jidx+6];
790 jnrlistH = jjnr[jidx+7];
791 /* Sign of each element will be negative for non-real atoms.
792 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
793 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
795 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
796 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
798 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
799 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
800 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
801 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
802 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
803 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
804 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
805 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
806 j_coord_offsetA = DIM*jnrA;
807 j_coord_offsetB = DIM*jnrB;
808 j_coord_offsetC = DIM*jnrC;
809 j_coord_offsetD = DIM*jnrD;
810 j_coord_offsetE = DIM*jnrE;
811 j_coord_offsetF = DIM*jnrF;
812 j_coord_offsetG = DIM*jnrG;
813 j_coord_offsetH = DIM*jnrH;
815 /* load j atom coordinates */
816 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
817 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
818 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
819 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
820 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
822 /* Calculate displacement vector */
823 dx11 = _mm256_sub_ps(ix1,jx1);
824 dy11 = _mm256_sub_ps(iy1,jy1);
825 dz11 = _mm256_sub_ps(iz1,jz1);
826 dx12 = _mm256_sub_ps(ix1,jx2);
827 dy12 = _mm256_sub_ps(iy1,jy2);
828 dz12 = _mm256_sub_ps(iz1,jz2);
829 dx13 = _mm256_sub_ps(ix1,jx3);
830 dy13 = _mm256_sub_ps(iy1,jy3);
831 dz13 = _mm256_sub_ps(iz1,jz3);
832 dx21 = _mm256_sub_ps(ix2,jx1);
833 dy21 = _mm256_sub_ps(iy2,jy1);
834 dz21 = _mm256_sub_ps(iz2,jz1);
835 dx22 = _mm256_sub_ps(ix2,jx2);
836 dy22 = _mm256_sub_ps(iy2,jy2);
837 dz22 = _mm256_sub_ps(iz2,jz2);
838 dx23 = _mm256_sub_ps(ix2,jx3);
839 dy23 = _mm256_sub_ps(iy2,jy3);
840 dz23 = _mm256_sub_ps(iz2,jz3);
841 dx31 = _mm256_sub_ps(ix3,jx1);
842 dy31 = _mm256_sub_ps(iy3,jy1);
843 dz31 = _mm256_sub_ps(iz3,jz1);
844 dx32 = _mm256_sub_ps(ix3,jx2);
845 dy32 = _mm256_sub_ps(iy3,jy2);
846 dz32 = _mm256_sub_ps(iz3,jz2);
847 dx33 = _mm256_sub_ps(ix3,jx3);
848 dy33 = _mm256_sub_ps(iy3,jy3);
849 dz33 = _mm256_sub_ps(iz3,jz3);
851 /* Calculate squared distance and things based on it */
852 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
853 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
854 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
855 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
856 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
857 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
858 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
859 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
860 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
862 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
863 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
864 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
865 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
866 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
867 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
868 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
869 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
870 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
872 fjx1 = _mm256_setzero_ps();
873 fjy1 = _mm256_setzero_ps();
874 fjz1 = _mm256_setzero_ps();
875 fjx2 = _mm256_setzero_ps();
876 fjy2 = _mm256_setzero_ps();
877 fjz2 = _mm256_setzero_ps();
878 fjx3 = _mm256_setzero_ps();
879 fjy3 = _mm256_setzero_ps();
880 fjz3 = _mm256_setzero_ps();
882 /**************************
883 * CALCULATE INTERACTIONS *
884 **************************/
886 r11 = _mm256_mul_ps(rsq11,rinv11);
887 r11 = _mm256_andnot_ps(dummy_mask,r11);
889 /* Calculate table index by multiplying r with table scale and truncate to integer */
890 rt = _mm256_mul_ps(r11,vftabscale);
891 vfitab = _mm256_cvttps_epi32(rt);
892 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
893 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
894 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
895 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
896 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
897 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
899 /* CUBIC SPLINE TABLE ELECTROSTATICS */
900 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
901 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
902 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
903 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
904 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
905 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
906 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
907 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
908 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
909 Heps = _mm256_mul_ps(vfeps,H);
910 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
911 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
912 velec = _mm256_mul_ps(qq11,VV);
913 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
914 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
916 /* Update potential sum for this i atom from the interaction with this j atom. */
917 velec = _mm256_andnot_ps(dummy_mask,velec);
918 velecsum = _mm256_add_ps(velecsum,velec);
922 fscal = _mm256_andnot_ps(dummy_mask,fscal);
924 /* Calculate temporary vectorial force */
925 tx = _mm256_mul_ps(fscal,dx11);
926 ty = _mm256_mul_ps(fscal,dy11);
927 tz = _mm256_mul_ps(fscal,dz11);
929 /* Update vectorial force */
930 fix1 = _mm256_add_ps(fix1,tx);
931 fiy1 = _mm256_add_ps(fiy1,ty);
932 fiz1 = _mm256_add_ps(fiz1,tz);
934 fjx1 = _mm256_add_ps(fjx1,tx);
935 fjy1 = _mm256_add_ps(fjy1,ty);
936 fjz1 = _mm256_add_ps(fjz1,tz);
938 /**************************
939 * CALCULATE INTERACTIONS *
940 **************************/
942 r12 = _mm256_mul_ps(rsq12,rinv12);
943 r12 = _mm256_andnot_ps(dummy_mask,r12);
945 /* Calculate table index by multiplying r with table scale and truncate to integer */
946 rt = _mm256_mul_ps(r12,vftabscale);
947 vfitab = _mm256_cvttps_epi32(rt);
948 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
949 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
950 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
951 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
952 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
953 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
955 /* CUBIC SPLINE TABLE ELECTROSTATICS */
956 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
957 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
958 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
959 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
960 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
961 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
962 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
963 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
964 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
965 Heps = _mm256_mul_ps(vfeps,H);
966 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
967 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
968 velec = _mm256_mul_ps(qq12,VV);
969 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
970 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
972 /* Update potential sum for this i atom from the interaction with this j atom. */
973 velec = _mm256_andnot_ps(dummy_mask,velec);
974 velecsum = _mm256_add_ps(velecsum,velec);
978 fscal = _mm256_andnot_ps(dummy_mask,fscal);
980 /* Calculate temporary vectorial force */
981 tx = _mm256_mul_ps(fscal,dx12);
982 ty = _mm256_mul_ps(fscal,dy12);
983 tz = _mm256_mul_ps(fscal,dz12);
985 /* Update vectorial force */
986 fix1 = _mm256_add_ps(fix1,tx);
987 fiy1 = _mm256_add_ps(fiy1,ty);
988 fiz1 = _mm256_add_ps(fiz1,tz);
990 fjx2 = _mm256_add_ps(fjx2,tx);
991 fjy2 = _mm256_add_ps(fjy2,ty);
992 fjz2 = _mm256_add_ps(fjz2,tz);
994 /**************************
995 * CALCULATE INTERACTIONS *
996 **************************/
998 r13 = _mm256_mul_ps(rsq13,rinv13);
999 r13 = _mm256_andnot_ps(dummy_mask,r13);
1001 /* Calculate table index by multiplying r with table scale and truncate to integer */
1002 rt = _mm256_mul_ps(r13,vftabscale);
1003 vfitab = _mm256_cvttps_epi32(rt);
1004 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1005 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1006 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1007 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1008 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1009 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1011 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1012 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1013 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1014 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1015 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1016 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1017 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1018 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1019 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1020 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1021 Heps = _mm256_mul_ps(vfeps,H);
1022 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1023 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1024 velec = _mm256_mul_ps(qq13,VV);
1025 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1026 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
1028 /* Update potential sum for this i atom from the interaction with this j atom. */
1029 velec = _mm256_andnot_ps(dummy_mask,velec);
1030 velecsum = _mm256_add_ps(velecsum,velec);
1034 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1036 /* Calculate temporary vectorial force */
1037 tx = _mm256_mul_ps(fscal,dx13);
1038 ty = _mm256_mul_ps(fscal,dy13);
1039 tz = _mm256_mul_ps(fscal,dz13);
1041 /* Update vectorial force */
1042 fix1 = _mm256_add_ps(fix1,tx);
1043 fiy1 = _mm256_add_ps(fiy1,ty);
1044 fiz1 = _mm256_add_ps(fiz1,tz);
1046 fjx3 = _mm256_add_ps(fjx3,tx);
1047 fjy3 = _mm256_add_ps(fjy3,ty);
1048 fjz3 = _mm256_add_ps(fjz3,tz);
1050 /**************************
1051 * CALCULATE INTERACTIONS *
1052 **************************/
1054 r21 = _mm256_mul_ps(rsq21,rinv21);
1055 r21 = _mm256_andnot_ps(dummy_mask,r21);
1057 /* Calculate table index by multiplying r with table scale and truncate to integer */
1058 rt = _mm256_mul_ps(r21,vftabscale);
1059 vfitab = _mm256_cvttps_epi32(rt);
1060 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1061 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1062 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1063 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1064 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1065 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1067 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1068 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1069 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1070 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1071 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1072 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1073 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1074 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1075 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1076 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1077 Heps = _mm256_mul_ps(vfeps,H);
1078 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1079 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1080 velec = _mm256_mul_ps(qq21,VV);
1081 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1082 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1084 /* Update potential sum for this i atom from the interaction with this j atom. */
1085 velec = _mm256_andnot_ps(dummy_mask,velec);
1086 velecsum = _mm256_add_ps(velecsum,velec);
1090 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1092 /* Calculate temporary vectorial force */
1093 tx = _mm256_mul_ps(fscal,dx21);
1094 ty = _mm256_mul_ps(fscal,dy21);
1095 tz = _mm256_mul_ps(fscal,dz21);
1097 /* Update vectorial force */
1098 fix2 = _mm256_add_ps(fix2,tx);
1099 fiy2 = _mm256_add_ps(fiy2,ty);
1100 fiz2 = _mm256_add_ps(fiz2,tz);
1102 fjx1 = _mm256_add_ps(fjx1,tx);
1103 fjy1 = _mm256_add_ps(fjy1,ty);
1104 fjz1 = _mm256_add_ps(fjz1,tz);
1106 /**************************
1107 * CALCULATE INTERACTIONS *
1108 **************************/
1110 r22 = _mm256_mul_ps(rsq22,rinv22);
1111 r22 = _mm256_andnot_ps(dummy_mask,r22);
1113 /* Calculate table index by multiplying r with table scale and truncate to integer */
1114 rt = _mm256_mul_ps(r22,vftabscale);
1115 vfitab = _mm256_cvttps_epi32(rt);
1116 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1117 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1118 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1119 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1120 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1121 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1123 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1124 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1125 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1126 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1127 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1128 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1129 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1130 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1131 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1132 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1133 Heps = _mm256_mul_ps(vfeps,H);
1134 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1135 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1136 velec = _mm256_mul_ps(qq22,VV);
1137 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1138 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1140 /* Update potential sum for this i atom from the interaction with this j atom. */
1141 velec = _mm256_andnot_ps(dummy_mask,velec);
1142 velecsum = _mm256_add_ps(velecsum,velec);
1146 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1148 /* Calculate temporary vectorial force */
1149 tx = _mm256_mul_ps(fscal,dx22);
1150 ty = _mm256_mul_ps(fscal,dy22);
1151 tz = _mm256_mul_ps(fscal,dz22);
1153 /* Update vectorial force */
1154 fix2 = _mm256_add_ps(fix2,tx);
1155 fiy2 = _mm256_add_ps(fiy2,ty);
1156 fiz2 = _mm256_add_ps(fiz2,tz);
1158 fjx2 = _mm256_add_ps(fjx2,tx);
1159 fjy2 = _mm256_add_ps(fjy2,ty);
1160 fjz2 = _mm256_add_ps(fjz2,tz);
1162 /**************************
1163 * CALCULATE INTERACTIONS *
1164 **************************/
1166 r23 = _mm256_mul_ps(rsq23,rinv23);
1167 r23 = _mm256_andnot_ps(dummy_mask,r23);
1169 /* Calculate table index by multiplying r with table scale and truncate to integer */
1170 rt = _mm256_mul_ps(r23,vftabscale);
1171 vfitab = _mm256_cvttps_epi32(rt);
1172 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1173 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1174 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1175 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1176 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1177 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1179 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1180 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1181 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1182 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1183 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1184 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1185 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1186 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1187 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1188 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1189 Heps = _mm256_mul_ps(vfeps,H);
1190 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1191 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1192 velec = _mm256_mul_ps(qq23,VV);
1193 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1194 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
1196 /* Update potential sum for this i atom from the interaction with this j atom. */
1197 velec = _mm256_andnot_ps(dummy_mask,velec);
1198 velecsum = _mm256_add_ps(velecsum,velec);
1202 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1204 /* Calculate temporary vectorial force */
1205 tx = _mm256_mul_ps(fscal,dx23);
1206 ty = _mm256_mul_ps(fscal,dy23);
1207 tz = _mm256_mul_ps(fscal,dz23);
1209 /* Update vectorial force */
1210 fix2 = _mm256_add_ps(fix2,tx);
1211 fiy2 = _mm256_add_ps(fiy2,ty);
1212 fiz2 = _mm256_add_ps(fiz2,tz);
1214 fjx3 = _mm256_add_ps(fjx3,tx);
1215 fjy3 = _mm256_add_ps(fjy3,ty);
1216 fjz3 = _mm256_add_ps(fjz3,tz);
1218 /**************************
1219 * CALCULATE INTERACTIONS *
1220 **************************/
1222 r31 = _mm256_mul_ps(rsq31,rinv31);
1223 r31 = _mm256_andnot_ps(dummy_mask,r31);
1225 /* Calculate table index by multiplying r with table scale and truncate to integer */
1226 rt = _mm256_mul_ps(r31,vftabscale);
1227 vfitab = _mm256_cvttps_epi32(rt);
1228 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1229 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1230 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1231 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1232 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1233 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1235 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1236 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1237 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1238 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1239 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1240 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1241 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1242 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1243 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1244 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1245 Heps = _mm256_mul_ps(vfeps,H);
1246 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1247 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1248 velec = _mm256_mul_ps(qq31,VV);
1249 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1250 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
1252 /* Update potential sum for this i atom from the interaction with this j atom. */
1253 velec = _mm256_andnot_ps(dummy_mask,velec);
1254 velecsum = _mm256_add_ps(velecsum,velec);
1258 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1260 /* Calculate temporary vectorial force */
1261 tx = _mm256_mul_ps(fscal,dx31);
1262 ty = _mm256_mul_ps(fscal,dy31);
1263 tz = _mm256_mul_ps(fscal,dz31);
1265 /* Update vectorial force */
1266 fix3 = _mm256_add_ps(fix3,tx);
1267 fiy3 = _mm256_add_ps(fiy3,ty);
1268 fiz3 = _mm256_add_ps(fiz3,tz);
1270 fjx1 = _mm256_add_ps(fjx1,tx);
1271 fjy1 = _mm256_add_ps(fjy1,ty);
1272 fjz1 = _mm256_add_ps(fjz1,tz);
1274 /**************************
1275 * CALCULATE INTERACTIONS *
1276 **************************/
1278 r32 = _mm256_mul_ps(rsq32,rinv32);
1279 r32 = _mm256_andnot_ps(dummy_mask,r32);
1281 /* Calculate table index by multiplying r with table scale and truncate to integer */
1282 rt = _mm256_mul_ps(r32,vftabscale);
1283 vfitab = _mm256_cvttps_epi32(rt);
1284 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1285 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1286 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1287 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1288 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1289 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1291 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1292 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1293 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1294 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1295 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1296 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1297 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1298 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1299 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1300 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1301 Heps = _mm256_mul_ps(vfeps,H);
1302 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1303 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1304 velec = _mm256_mul_ps(qq32,VV);
1305 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1306 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
1308 /* Update potential sum for this i atom from the interaction with this j atom. */
1309 velec = _mm256_andnot_ps(dummy_mask,velec);
1310 velecsum = _mm256_add_ps(velecsum,velec);
1314 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1316 /* Calculate temporary vectorial force */
1317 tx = _mm256_mul_ps(fscal,dx32);
1318 ty = _mm256_mul_ps(fscal,dy32);
1319 tz = _mm256_mul_ps(fscal,dz32);
1321 /* Update vectorial force */
1322 fix3 = _mm256_add_ps(fix3,tx);
1323 fiy3 = _mm256_add_ps(fiy3,ty);
1324 fiz3 = _mm256_add_ps(fiz3,tz);
1326 fjx2 = _mm256_add_ps(fjx2,tx);
1327 fjy2 = _mm256_add_ps(fjy2,ty);
1328 fjz2 = _mm256_add_ps(fjz2,tz);
1330 /**************************
1331 * CALCULATE INTERACTIONS *
1332 **************************/
1334 r33 = _mm256_mul_ps(rsq33,rinv33);
1335 r33 = _mm256_andnot_ps(dummy_mask,r33);
1337 /* Calculate table index by multiplying r with table scale and truncate to integer */
1338 rt = _mm256_mul_ps(r33,vftabscale);
1339 vfitab = _mm256_cvttps_epi32(rt);
1340 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1341 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1342 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1343 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1344 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1345 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1347 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1348 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1349 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1350 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1351 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1352 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1353 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1354 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1355 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1356 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1357 Heps = _mm256_mul_ps(vfeps,H);
1358 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1359 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1360 velec = _mm256_mul_ps(qq33,VV);
1361 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1362 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
1364 /* Update potential sum for this i atom from the interaction with this j atom. */
1365 velec = _mm256_andnot_ps(dummy_mask,velec);
1366 velecsum = _mm256_add_ps(velecsum,velec);
1370 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1372 /* Calculate temporary vectorial force */
1373 tx = _mm256_mul_ps(fscal,dx33);
1374 ty = _mm256_mul_ps(fscal,dy33);
1375 tz = _mm256_mul_ps(fscal,dz33);
1377 /* Update vectorial force */
1378 fix3 = _mm256_add_ps(fix3,tx);
1379 fiy3 = _mm256_add_ps(fiy3,ty);
1380 fiz3 = _mm256_add_ps(fiz3,tz);
1382 fjx3 = _mm256_add_ps(fjx3,tx);
1383 fjy3 = _mm256_add_ps(fjy3,ty);
1384 fjz3 = _mm256_add_ps(fjz3,tz);
1386 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1387 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1388 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1389 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1390 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1391 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1392 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1393 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1395 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
1396 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
1397 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1399 /* Inner loop uses 396 flops */
1402 /* End of innermost loop */
1404 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1405 f+i_coord_offset+DIM,fshift+i_shift_offset);
1408 /* Update potential energies */
1409 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1411 /* Increment number of inner iterations */
1412 inneriter += j_index_end - j_index_start;
1414 /* Outer loop uses 19 flops */
1417 /* Increment number of outer iterations */
1420 /* Update outer/inner flops */
1422 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_VF,outeriter*19 + inneriter*396);
1425 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_single
1426 * Electrostatics interaction: CubicSplineTable
1427 * VdW interaction: None
1428 * Geometry: Water4-Water4
1429 * Calculate force/pot: Force
1432 nb_kernel_ElecCSTab_VdwNone_GeomW4W4_F_avx_256_single
1433 (t_nblist * gmx_restrict nlist,
1434 rvec * gmx_restrict xx,
1435 rvec * gmx_restrict ff,
1436 t_forcerec * gmx_restrict fr,
1437 t_mdatoms * gmx_restrict mdatoms,
1438 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1439 t_nrnb * gmx_restrict nrnb)
1441 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1442 * just 0 for non-waters.
1443 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1444 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1446 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1447 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1448 int jnrA,jnrB,jnrC,jnrD;
1449 int jnrE,jnrF,jnrG,jnrH;
1450 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1451 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1452 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1453 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1454 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1455 real rcutoff_scalar;
1456 real *shiftvec,*fshift,*x,*f;
1457 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1458 real scratch[4*DIM];
1459 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1460 real * vdwioffsetptr1;
1461 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1462 real * vdwioffsetptr2;
1463 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1464 real * vdwioffsetptr3;
1465 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1466 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1467 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1468 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1469 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1470 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1471 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1472 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1473 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1474 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1475 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1476 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1477 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1478 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1479 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1480 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1481 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1484 __m128i vfitab_lo,vfitab_hi;
1485 __m128i ifour = _mm_set1_epi32(4);
1486 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1488 __m256 dummy_mask,cutoff_mask;
1489 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1490 __m256 one = _mm256_set1_ps(1.0);
1491 __m256 two = _mm256_set1_ps(2.0);
1497 jindex = nlist->jindex;
1499 shiftidx = nlist->shift;
1501 shiftvec = fr->shift_vec[0];
1502 fshift = fr->fshift[0];
1503 facel = _mm256_set1_ps(fr->epsfac);
1504 charge = mdatoms->chargeA;
1506 vftab = kernel_data->table_elec->data;
1507 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
1509 /* Setup water-specific parameters */
1510 inr = nlist->iinr[0];
1511 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1512 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1513 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1515 jq1 = _mm256_set1_ps(charge[inr+1]);
1516 jq2 = _mm256_set1_ps(charge[inr+2]);
1517 jq3 = _mm256_set1_ps(charge[inr+3]);
1518 qq11 = _mm256_mul_ps(iq1,jq1);
1519 qq12 = _mm256_mul_ps(iq1,jq2);
1520 qq13 = _mm256_mul_ps(iq1,jq3);
1521 qq21 = _mm256_mul_ps(iq2,jq1);
1522 qq22 = _mm256_mul_ps(iq2,jq2);
1523 qq23 = _mm256_mul_ps(iq2,jq3);
1524 qq31 = _mm256_mul_ps(iq3,jq1);
1525 qq32 = _mm256_mul_ps(iq3,jq2);
1526 qq33 = _mm256_mul_ps(iq3,jq3);
1528 /* Avoid stupid compiler warnings */
1529 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1530 j_coord_offsetA = 0;
1531 j_coord_offsetB = 0;
1532 j_coord_offsetC = 0;
1533 j_coord_offsetD = 0;
1534 j_coord_offsetE = 0;
1535 j_coord_offsetF = 0;
1536 j_coord_offsetG = 0;
1537 j_coord_offsetH = 0;
1542 for(iidx=0;iidx<4*DIM;iidx++)
1544 scratch[iidx] = 0.0;
1547 /* Start outer loop over neighborlists */
1548 for(iidx=0; iidx<nri; iidx++)
1550 /* Load shift vector for this list */
1551 i_shift_offset = DIM*shiftidx[iidx];
1553 /* Load limits for loop over neighbors */
1554 j_index_start = jindex[iidx];
1555 j_index_end = jindex[iidx+1];
1557 /* Get outer coordinate index */
1559 i_coord_offset = DIM*inr;
1561 /* Load i particle coords and add shift vector */
1562 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset+DIM,
1563 &ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1565 fix1 = _mm256_setzero_ps();
1566 fiy1 = _mm256_setzero_ps();
1567 fiz1 = _mm256_setzero_ps();
1568 fix2 = _mm256_setzero_ps();
1569 fiy2 = _mm256_setzero_ps();
1570 fiz2 = _mm256_setzero_ps();
1571 fix3 = _mm256_setzero_ps();
1572 fiy3 = _mm256_setzero_ps();
1573 fiz3 = _mm256_setzero_ps();
1575 /* Start inner kernel loop */
1576 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1579 /* Get j neighbor index, and coordinate index */
1581 jnrB = jjnr[jidx+1];
1582 jnrC = jjnr[jidx+2];
1583 jnrD = jjnr[jidx+3];
1584 jnrE = jjnr[jidx+4];
1585 jnrF = jjnr[jidx+5];
1586 jnrG = jjnr[jidx+6];
1587 jnrH = jjnr[jidx+7];
1588 j_coord_offsetA = DIM*jnrA;
1589 j_coord_offsetB = DIM*jnrB;
1590 j_coord_offsetC = DIM*jnrC;
1591 j_coord_offsetD = DIM*jnrD;
1592 j_coord_offsetE = DIM*jnrE;
1593 j_coord_offsetF = DIM*jnrF;
1594 j_coord_offsetG = DIM*jnrG;
1595 j_coord_offsetH = DIM*jnrH;
1597 /* load j atom coordinates */
1598 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
1599 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
1600 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
1601 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
1602 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
1604 /* Calculate displacement vector */
1605 dx11 = _mm256_sub_ps(ix1,jx1);
1606 dy11 = _mm256_sub_ps(iy1,jy1);
1607 dz11 = _mm256_sub_ps(iz1,jz1);
1608 dx12 = _mm256_sub_ps(ix1,jx2);
1609 dy12 = _mm256_sub_ps(iy1,jy2);
1610 dz12 = _mm256_sub_ps(iz1,jz2);
1611 dx13 = _mm256_sub_ps(ix1,jx3);
1612 dy13 = _mm256_sub_ps(iy1,jy3);
1613 dz13 = _mm256_sub_ps(iz1,jz3);
1614 dx21 = _mm256_sub_ps(ix2,jx1);
1615 dy21 = _mm256_sub_ps(iy2,jy1);
1616 dz21 = _mm256_sub_ps(iz2,jz1);
1617 dx22 = _mm256_sub_ps(ix2,jx2);
1618 dy22 = _mm256_sub_ps(iy2,jy2);
1619 dz22 = _mm256_sub_ps(iz2,jz2);
1620 dx23 = _mm256_sub_ps(ix2,jx3);
1621 dy23 = _mm256_sub_ps(iy2,jy3);
1622 dz23 = _mm256_sub_ps(iz2,jz3);
1623 dx31 = _mm256_sub_ps(ix3,jx1);
1624 dy31 = _mm256_sub_ps(iy3,jy1);
1625 dz31 = _mm256_sub_ps(iz3,jz1);
1626 dx32 = _mm256_sub_ps(ix3,jx2);
1627 dy32 = _mm256_sub_ps(iy3,jy2);
1628 dz32 = _mm256_sub_ps(iz3,jz2);
1629 dx33 = _mm256_sub_ps(ix3,jx3);
1630 dy33 = _mm256_sub_ps(iy3,jy3);
1631 dz33 = _mm256_sub_ps(iz3,jz3);
1633 /* Calculate squared distance and things based on it */
1634 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1635 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1636 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1637 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1638 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1639 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1640 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1641 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1642 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1644 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1645 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1646 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1647 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1648 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1649 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1650 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1651 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1652 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1654 fjx1 = _mm256_setzero_ps();
1655 fjy1 = _mm256_setzero_ps();
1656 fjz1 = _mm256_setzero_ps();
1657 fjx2 = _mm256_setzero_ps();
1658 fjy2 = _mm256_setzero_ps();
1659 fjz2 = _mm256_setzero_ps();
1660 fjx3 = _mm256_setzero_ps();
1661 fjy3 = _mm256_setzero_ps();
1662 fjz3 = _mm256_setzero_ps();
1664 /**************************
1665 * CALCULATE INTERACTIONS *
1666 **************************/
1668 r11 = _mm256_mul_ps(rsq11,rinv11);
1670 /* Calculate table index by multiplying r with table scale and truncate to integer */
1671 rt = _mm256_mul_ps(r11,vftabscale);
1672 vfitab = _mm256_cvttps_epi32(rt);
1673 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1674 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1675 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1676 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1677 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1678 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1680 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1681 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1682 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1683 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1684 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1685 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1686 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1687 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1688 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1689 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1690 Heps = _mm256_mul_ps(vfeps,H);
1691 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1692 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1693 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1697 /* Calculate temporary vectorial force */
1698 tx = _mm256_mul_ps(fscal,dx11);
1699 ty = _mm256_mul_ps(fscal,dy11);
1700 tz = _mm256_mul_ps(fscal,dz11);
1702 /* Update vectorial force */
1703 fix1 = _mm256_add_ps(fix1,tx);
1704 fiy1 = _mm256_add_ps(fiy1,ty);
1705 fiz1 = _mm256_add_ps(fiz1,tz);
1707 fjx1 = _mm256_add_ps(fjx1,tx);
1708 fjy1 = _mm256_add_ps(fjy1,ty);
1709 fjz1 = _mm256_add_ps(fjz1,tz);
1711 /**************************
1712 * CALCULATE INTERACTIONS *
1713 **************************/
1715 r12 = _mm256_mul_ps(rsq12,rinv12);
1717 /* Calculate table index by multiplying r with table scale and truncate to integer */
1718 rt = _mm256_mul_ps(r12,vftabscale);
1719 vfitab = _mm256_cvttps_epi32(rt);
1720 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1721 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1722 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1723 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1724 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1725 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1727 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1728 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1729 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1730 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1731 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1732 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1733 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1734 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1735 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1736 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1737 Heps = _mm256_mul_ps(vfeps,H);
1738 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1739 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1740 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1744 /* Calculate temporary vectorial force */
1745 tx = _mm256_mul_ps(fscal,dx12);
1746 ty = _mm256_mul_ps(fscal,dy12);
1747 tz = _mm256_mul_ps(fscal,dz12);
1749 /* Update vectorial force */
1750 fix1 = _mm256_add_ps(fix1,tx);
1751 fiy1 = _mm256_add_ps(fiy1,ty);
1752 fiz1 = _mm256_add_ps(fiz1,tz);
1754 fjx2 = _mm256_add_ps(fjx2,tx);
1755 fjy2 = _mm256_add_ps(fjy2,ty);
1756 fjz2 = _mm256_add_ps(fjz2,tz);
1758 /**************************
1759 * CALCULATE INTERACTIONS *
1760 **************************/
1762 r13 = _mm256_mul_ps(rsq13,rinv13);
1764 /* Calculate table index by multiplying r with table scale and truncate to integer */
1765 rt = _mm256_mul_ps(r13,vftabscale);
1766 vfitab = _mm256_cvttps_epi32(rt);
1767 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1768 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1769 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1770 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1771 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1772 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1774 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1775 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1776 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1777 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1778 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1779 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1780 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1781 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1782 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1783 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1784 Heps = _mm256_mul_ps(vfeps,H);
1785 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1786 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1787 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
1791 /* Calculate temporary vectorial force */
1792 tx = _mm256_mul_ps(fscal,dx13);
1793 ty = _mm256_mul_ps(fscal,dy13);
1794 tz = _mm256_mul_ps(fscal,dz13);
1796 /* Update vectorial force */
1797 fix1 = _mm256_add_ps(fix1,tx);
1798 fiy1 = _mm256_add_ps(fiy1,ty);
1799 fiz1 = _mm256_add_ps(fiz1,tz);
1801 fjx3 = _mm256_add_ps(fjx3,tx);
1802 fjy3 = _mm256_add_ps(fjy3,ty);
1803 fjz3 = _mm256_add_ps(fjz3,tz);
1805 /**************************
1806 * CALCULATE INTERACTIONS *
1807 **************************/
1809 r21 = _mm256_mul_ps(rsq21,rinv21);
1811 /* Calculate table index by multiplying r with table scale and truncate to integer */
1812 rt = _mm256_mul_ps(r21,vftabscale);
1813 vfitab = _mm256_cvttps_epi32(rt);
1814 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1815 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1816 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1817 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1818 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1819 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1821 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1822 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1823 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1824 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1825 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1826 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1827 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1828 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1829 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1830 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1831 Heps = _mm256_mul_ps(vfeps,H);
1832 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1833 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1834 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1838 /* Calculate temporary vectorial force */
1839 tx = _mm256_mul_ps(fscal,dx21);
1840 ty = _mm256_mul_ps(fscal,dy21);
1841 tz = _mm256_mul_ps(fscal,dz21);
1843 /* Update vectorial force */
1844 fix2 = _mm256_add_ps(fix2,tx);
1845 fiy2 = _mm256_add_ps(fiy2,ty);
1846 fiz2 = _mm256_add_ps(fiz2,tz);
1848 fjx1 = _mm256_add_ps(fjx1,tx);
1849 fjy1 = _mm256_add_ps(fjy1,ty);
1850 fjz1 = _mm256_add_ps(fjz1,tz);
1852 /**************************
1853 * CALCULATE INTERACTIONS *
1854 **************************/
1856 r22 = _mm256_mul_ps(rsq22,rinv22);
1858 /* Calculate table index by multiplying r with table scale and truncate to integer */
1859 rt = _mm256_mul_ps(r22,vftabscale);
1860 vfitab = _mm256_cvttps_epi32(rt);
1861 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1862 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1863 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1864 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1865 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1866 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1868 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1869 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1870 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1871 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1872 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1873 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1874 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1875 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1876 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1877 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1878 Heps = _mm256_mul_ps(vfeps,H);
1879 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1880 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1881 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1885 /* Calculate temporary vectorial force */
1886 tx = _mm256_mul_ps(fscal,dx22);
1887 ty = _mm256_mul_ps(fscal,dy22);
1888 tz = _mm256_mul_ps(fscal,dz22);
1890 /* Update vectorial force */
1891 fix2 = _mm256_add_ps(fix2,tx);
1892 fiy2 = _mm256_add_ps(fiy2,ty);
1893 fiz2 = _mm256_add_ps(fiz2,tz);
1895 fjx2 = _mm256_add_ps(fjx2,tx);
1896 fjy2 = _mm256_add_ps(fjy2,ty);
1897 fjz2 = _mm256_add_ps(fjz2,tz);
1899 /**************************
1900 * CALCULATE INTERACTIONS *
1901 **************************/
1903 r23 = _mm256_mul_ps(rsq23,rinv23);
1905 /* Calculate table index by multiplying r with table scale and truncate to integer */
1906 rt = _mm256_mul_ps(r23,vftabscale);
1907 vfitab = _mm256_cvttps_epi32(rt);
1908 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1909 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1910 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1911 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1912 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1913 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1915 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1916 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1917 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1918 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1919 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1920 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1921 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1922 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1923 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1924 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1925 Heps = _mm256_mul_ps(vfeps,H);
1926 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1927 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1928 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
1932 /* Calculate temporary vectorial force */
1933 tx = _mm256_mul_ps(fscal,dx23);
1934 ty = _mm256_mul_ps(fscal,dy23);
1935 tz = _mm256_mul_ps(fscal,dz23);
1937 /* Update vectorial force */
1938 fix2 = _mm256_add_ps(fix2,tx);
1939 fiy2 = _mm256_add_ps(fiy2,ty);
1940 fiz2 = _mm256_add_ps(fiz2,tz);
1942 fjx3 = _mm256_add_ps(fjx3,tx);
1943 fjy3 = _mm256_add_ps(fjy3,ty);
1944 fjz3 = _mm256_add_ps(fjz3,tz);
1946 /**************************
1947 * CALCULATE INTERACTIONS *
1948 **************************/
1950 r31 = _mm256_mul_ps(rsq31,rinv31);
1952 /* Calculate table index by multiplying r with table scale and truncate to integer */
1953 rt = _mm256_mul_ps(r31,vftabscale);
1954 vfitab = _mm256_cvttps_epi32(rt);
1955 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1956 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1957 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1958 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1959 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1960 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1962 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1963 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1964 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1965 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1966 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1967 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1968 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1969 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1970 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1971 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1972 Heps = _mm256_mul_ps(vfeps,H);
1973 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1974 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1975 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
1979 /* Calculate temporary vectorial force */
1980 tx = _mm256_mul_ps(fscal,dx31);
1981 ty = _mm256_mul_ps(fscal,dy31);
1982 tz = _mm256_mul_ps(fscal,dz31);
1984 /* Update vectorial force */
1985 fix3 = _mm256_add_ps(fix3,tx);
1986 fiy3 = _mm256_add_ps(fiy3,ty);
1987 fiz3 = _mm256_add_ps(fiz3,tz);
1989 fjx1 = _mm256_add_ps(fjx1,tx);
1990 fjy1 = _mm256_add_ps(fjy1,ty);
1991 fjz1 = _mm256_add_ps(fjz1,tz);
1993 /**************************
1994 * CALCULATE INTERACTIONS *
1995 **************************/
1997 r32 = _mm256_mul_ps(rsq32,rinv32);
1999 /* Calculate table index by multiplying r with table scale and truncate to integer */
2000 rt = _mm256_mul_ps(r32,vftabscale);
2001 vfitab = _mm256_cvttps_epi32(rt);
2002 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2003 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2004 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2005 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2006 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2007 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2009 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2010 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2011 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2012 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2013 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2014 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2015 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2016 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2017 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2018 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2019 Heps = _mm256_mul_ps(vfeps,H);
2020 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2021 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2022 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2026 /* Calculate temporary vectorial force */
2027 tx = _mm256_mul_ps(fscal,dx32);
2028 ty = _mm256_mul_ps(fscal,dy32);
2029 tz = _mm256_mul_ps(fscal,dz32);
2031 /* Update vectorial force */
2032 fix3 = _mm256_add_ps(fix3,tx);
2033 fiy3 = _mm256_add_ps(fiy3,ty);
2034 fiz3 = _mm256_add_ps(fiz3,tz);
2036 fjx2 = _mm256_add_ps(fjx2,tx);
2037 fjy2 = _mm256_add_ps(fjy2,ty);
2038 fjz2 = _mm256_add_ps(fjz2,tz);
2040 /**************************
2041 * CALCULATE INTERACTIONS *
2042 **************************/
2044 r33 = _mm256_mul_ps(rsq33,rinv33);
2046 /* Calculate table index by multiplying r with table scale and truncate to integer */
2047 rt = _mm256_mul_ps(r33,vftabscale);
2048 vfitab = _mm256_cvttps_epi32(rt);
2049 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2050 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2051 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2052 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2053 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2054 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2056 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2057 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2058 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2059 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2060 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2061 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2062 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2063 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2064 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2065 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2066 Heps = _mm256_mul_ps(vfeps,H);
2067 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2068 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2069 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2073 /* Calculate temporary vectorial force */
2074 tx = _mm256_mul_ps(fscal,dx33);
2075 ty = _mm256_mul_ps(fscal,dy33);
2076 tz = _mm256_mul_ps(fscal,dz33);
2078 /* Update vectorial force */
2079 fix3 = _mm256_add_ps(fix3,tx);
2080 fiy3 = _mm256_add_ps(fiy3,ty);
2081 fiz3 = _mm256_add_ps(fiz3,tz);
2083 fjx3 = _mm256_add_ps(fjx3,tx);
2084 fjy3 = _mm256_add_ps(fjy3,ty);
2085 fjz3 = _mm256_add_ps(fjz3,tz);
2087 fjptrA = f+j_coord_offsetA;
2088 fjptrB = f+j_coord_offsetB;
2089 fjptrC = f+j_coord_offsetC;
2090 fjptrD = f+j_coord_offsetD;
2091 fjptrE = f+j_coord_offsetE;
2092 fjptrF = f+j_coord_offsetF;
2093 fjptrG = f+j_coord_offsetG;
2094 fjptrH = f+j_coord_offsetH;
2096 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2097 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2098 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2100 /* Inner loop uses 351 flops */
2103 if(jidx<j_index_end)
2106 /* Get j neighbor index, and coordinate index */
2107 jnrlistA = jjnr[jidx];
2108 jnrlistB = jjnr[jidx+1];
2109 jnrlistC = jjnr[jidx+2];
2110 jnrlistD = jjnr[jidx+3];
2111 jnrlistE = jjnr[jidx+4];
2112 jnrlistF = jjnr[jidx+5];
2113 jnrlistG = jjnr[jidx+6];
2114 jnrlistH = jjnr[jidx+7];
2115 /* Sign of each element will be negative for non-real atoms.
2116 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2117 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2119 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2120 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2122 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2123 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2124 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2125 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2126 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2127 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2128 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2129 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2130 j_coord_offsetA = DIM*jnrA;
2131 j_coord_offsetB = DIM*jnrB;
2132 j_coord_offsetC = DIM*jnrC;
2133 j_coord_offsetD = DIM*jnrD;
2134 j_coord_offsetE = DIM*jnrE;
2135 j_coord_offsetF = DIM*jnrF;
2136 j_coord_offsetG = DIM*jnrG;
2137 j_coord_offsetH = DIM*jnrH;
2139 /* load j atom coordinates */
2140 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA+DIM,x+j_coord_offsetB+DIM,
2141 x+j_coord_offsetC+DIM,x+j_coord_offsetD+DIM,
2142 x+j_coord_offsetE+DIM,x+j_coord_offsetF+DIM,
2143 x+j_coord_offsetG+DIM,x+j_coord_offsetH+DIM,
2144 &jx1,&jy1,&jz1,&jx2,&jy2,&jz2,&jx3,&jy3,&jz3);
2146 /* Calculate displacement vector */
2147 dx11 = _mm256_sub_ps(ix1,jx1);
2148 dy11 = _mm256_sub_ps(iy1,jy1);
2149 dz11 = _mm256_sub_ps(iz1,jz1);
2150 dx12 = _mm256_sub_ps(ix1,jx2);
2151 dy12 = _mm256_sub_ps(iy1,jy2);
2152 dz12 = _mm256_sub_ps(iz1,jz2);
2153 dx13 = _mm256_sub_ps(ix1,jx3);
2154 dy13 = _mm256_sub_ps(iy1,jy3);
2155 dz13 = _mm256_sub_ps(iz1,jz3);
2156 dx21 = _mm256_sub_ps(ix2,jx1);
2157 dy21 = _mm256_sub_ps(iy2,jy1);
2158 dz21 = _mm256_sub_ps(iz2,jz1);
2159 dx22 = _mm256_sub_ps(ix2,jx2);
2160 dy22 = _mm256_sub_ps(iy2,jy2);
2161 dz22 = _mm256_sub_ps(iz2,jz2);
2162 dx23 = _mm256_sub_ps(ix2,jx3);
2163 dy23 = _mm256_sub_ps(iy2,jy3);
2164 dz23 = _mm256_sub_ps(iz2,jz3);
2165 dx31 = _mm256_sub_ps(ix3,jx1);
2166 dy31 = _mm256_sub_ps(iy3,jy1);
2167 dz31 = _mm256_sub_ps(iz3,jz1);
2168 dx32 = _mm256_sub_ps(ix3,jx2);
2169 dy32 = _mm256_sub_ps(iy3,jy2);
2170 dz32 = _mm256_sub_ps(iz3,jz2);
2171 dx33 = _mm256_sub_ps(ix3,jx3);
2172 dy33 = _mm256_sub_ps(iy3,jy3);
2173 dz33 = _mm256_sub_ps(iz3,jz3);
2175 /* Calculate squared distance and things based on it */
2176 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2177 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2178 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2179 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2180 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2181 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2182 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2183 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2184 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2186 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2187 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2188 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2189 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2190 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2191 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2192 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2193 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2194 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2196 fjx1 = _mm256_setzero_ps();
2197 fjy1 = _mm256_setzero_ps();
2198 fjz1 = _mm256_setzero_ps();
2199 fjx2 = _mm256_setzero_ps();
2200 fjy2 = _mm256_setzero_ps();
2201 fjz2 = _mm256_setzero_ps();
2202 fjx3 = _mm256_setzero_ps();
2203 fjy3 = _mm256_setzero_ps();
2204 fjz3 = _mm256_setzero_ps();
2206 /**************************
2207 * CALCULATE INTERACTIONS *
2208 **************************/
2210 r11 = _mm256_mul_ps(rsq11,rinv11);
2211 r11 = _mm256_andnot_ps(dummy_mask,r11);
2213 /* Calculate table index by multiplying r with table scale and truncate to integer */
2214 rt = _mm256_mul_ps(r11,vftabscale);
2215 vfitab = _mm256_cvttps_epi32(rt);
2216 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2217 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2218 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2219 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2220 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2221 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2223 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2224 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2225 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2226 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2227 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2228 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2229 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2230 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2231 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2232 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2233 Heps = _mm256_mul_ps(vfeps,H);
2234 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2235 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2236 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2240 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2242 /* Calculate temporary vectorial force */
2243 tx = _mm256_mul_ps(fscal,dx11);
2244 ty = _mm256_mul_ps(fscal,dy11);
2245 tz = _mm256_mul_ps(fscal,dz11);
2247 /* Update vectorial force */
2248 fix1 = _mm256_add_ps(fix1,tx);
2249 fiy1 = _mm256_add_ps(fiy1,ty);
2250 fiz1 = _mm256_add_ps(fiz1,tz);
2252 fjx1 = _mm256_add_ps(fjx1,tx);
2253 fjy1 = _mm256_add_ps(fjy1,ty);
2254 fjz1 = _mm256_add_ps(fjz1,tz);
2256 /**************************
2257 * CALCULATE INTERACTIONS *
2258 **************************/
2260 r12 = _mm256_mul_ps(rsq12,rinv12);
2261 r12 = _mm256_andnot_ps(dummy_mask,r12);
2263 /* Calculate table index by multiplying r with table scale and truncate to integer */
2264 rt = _mm256_mul_ps(r12,vftabscale);
2265 vfitab = _mm256_cvttps_epi32(rt);
2266 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2267 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2268 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2269 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2270 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2271 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2273 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2274 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2275 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2276 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2277 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2278 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2279 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2280 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2281 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2282 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2283 Heps = _mm256_mul_ps(vfeps,H);
2284 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2285 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2286 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2290 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2292 /* Calculate temporary vectorial force */
2293 tx = _mm256_mul_ps(fscal,dx12);
2294 ty = _mm256_mul_ps(fscal,dy12);
2295 tz = _mm256_mul_ps(fscal,dz12);
2297 /* Update vectorial force */
2298 fix1 = _mm256_add_ps(fix1,tx);
2299 fiy1 = _mm256_add_ps(fiy1,ty);
2300 fiz1 = _mm256_add_ps(fiz1,tz);
2302 fjx2 = _mm256_add_ps(fjx2,tx);
2303 fjy2 = _mm256_add_ps(fjy2,ty);
2304 fjz2 = _mm256_add_ps(fjz2,tz);
2306 /**************************
2307 * CALCULATE INTERACTIONS *
2308 **************************/
2310 r13 = _mm256_mul_ps(rsq13,rinv13);
2311 r13 = _mm256_andnot_ps(dummy_mask,r13);
2313 /* Calculate table index by multiplying r with table scale and truncate to integer */
2314 rt = _mm256_mul_ps(r13,vftabscale);
2315 vfitab = _mm256_cvttps_epi32(rt);
2316 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2317 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2318 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2319 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2320 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2321 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2323 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2324 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2325 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2326 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2327 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2328 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2329 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2330 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2331 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2332 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2333 Heps = _mm256_mul_ps(vfeps,H);
2334 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2335 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2336 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
2340 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2342 /* Calculate temporary vectorial force */
2343 tx = _mm256_mul_ps(fscal,dx13);
2344 ty = _mm256_mul_ps(fscal,dy13);
2345 tz = _mm256_mul_ps(fscal,dz13);
2347 /* Update vectorial force */
2348 fix1 = _mm256_add_ps(fix1,tx);
2349 fiy1 = _mm256_add_ps(fiy1,ty);
2350 fiz1 = _mm256_add_ps(fiz1,tz);
2352 fjx3 = _mm256_add_ps(fjx3,tx);
2353 fjy3 = _mm256_add_ps(fjy3,ty);
2354 fjz3 = _mm256_add_ps(fjz3,tz);
2356 /**************************
2357 * CALCULATE INTERACTIONS *
2358 **************************/
2360 r21 = _mm256_mul_ps(rsq21,rinv21);
2361 r21 = _mm256_andnot_ps(dummy_mask,r21);
2363 /* Calculate table index by multiplying r with table scale and truncate to integer */
2364 rt = _mm256_mul_ps(r21,vftabscale);
2365 vfitab = _mm256_cvttps_epi32(rt);
2366 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2367 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2368 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2369 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2370 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2371 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2373 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2374 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2375 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2376 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2377 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2378 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2379 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2380 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2381 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2382 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2383 Heps = _mm256_mul_ps(vfeps,H);
2384 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2385 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2386 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2390 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2392 /* Calculate temporary vectorial force */
2393 tx = _mm256_mul_ps(fscal,dx21);
2394 ty = _mm256_mul_ps(fscal,dy21);
2395 tz = _mm256_mul_ps(fscal,dz21);
2397 /* Update vectorial force */
2398 fix2 = _mm256_add_ps(fix2,tx);
2399 fiy2 = _mm256_add_ps(fiy2,ty);
2400 fiz2 = _mm256_add_ps(fiz2,tz);
2402 fjx1 = _mm256_add_ps(fjx1,tx);
2403 fjy1 = _mm256_add_ps(fjy1,ty);
2404 fjz1 = _mm256_add_ps(fjz1,tz);
2406 /**************************
2407 * CALCULATE INTERACTIONS *
2408 **************************/
2410 r22 = _mm256_mul_ps(rsq22,rinv22);
2411 r22 = _mm256_andnot_ps(dummy_mask,r22);
2413 /* Calculate table index by multiplying r with table scale and truncate to integer */
2414 rt = _mm256_mul_ps(r22,vftabscale);
2415 vfitab = _mm256_cvttps_epi32(rt);
2416 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2417 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2418 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2419 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2420 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2421 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2423 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2424 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2425 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2426 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2427 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2428 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2429 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2430 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2431 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2432 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2433 Heps = _mm256_mul_ps(vfeps,H);
2434 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2435 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2436 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2440 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2442 /* Calculate temporary vectorial force */
2443 tx = _mm256_mul_ps(fscal,dx22);
2444 ty = _mm256_mul_ps(fscal,dy22);
2445 tz = _mm256_mul_ps(fscal,dz22);
2447 /* Update vectorial force */
2448 fix2 = _mm256_add_ps(fix2,tx);
2449 fiy2 = _mm256_add_ps(fiy2,ty);
2450 fiz2 = _mm256_add_ps(fiz2,tz);
2452 fjx2 = _mm256_add_ps(fjx2,tx);
2453 fjy2 = _mm256_add_ps(fjy2,ty);
2454 fjz2 = _mm256_add_ps(fjz2,tz);
2456 /**************************
2457 * CALCULATE INTERACTIONS *
2458 **************************/
2460 r23 = _mm256_mul_ps(rsq23,rinv23);
2461 r23 = _mm256_andnot_ps(dummy_mask,r23);
2463 /* Calculate table index by multiplying r with table scale and truncate to integer */
2464 rt = _mm256_mul_ps(r23,vftabscale);
2465 vfitab = _mm256_cvttps_epi32(rt);
2466 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2467 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2468 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2469 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2470 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2471 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2473 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2474 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2475 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2476 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2477 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2478 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2479 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2480 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2481 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2482 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2483 Heps = _mm256_mul_ps(vfeps,H);
2484 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2485 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2486 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2490 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2492 /* Calculate temporary vectorial force */
2493 tx = _mm256_mul_ps(fscal,dx23);
2494 ty = _mm256_mul_ps(fscal,dy23);
2495 tz = _mm256_mul_ps(fscal,dz23);
2497 /* Update vectorial force */
2498 fix2 = _mm256_add_ps(fix2,tx);
2499 fiy2 = _mm256_add_ps(fiy2,ty);
2500 fiz2 = _mm256_add_ps(fiz2,tz);
2502 fjx3 = _mm256_add_ps(fjx3,tx);
2503 fjy3 = _mm256_add_ps(fjy3,ty);
2504 fjz3 = _mm256_add_ps(fjz3,tz);
2506 /**************************
2507 * CALCULATE INTERACTIONS *
2508 **************************/
2510 r31 = _mm256_mul_ps(rsq31,rinv31);
2511 r31 = _mm256_andnot_ps(dummy_mask,r31);
2513 /* Calculate table index by multiplying r with table scale and truncate to integer */
2514 rt = _mm256_mul_ps(r31,vftabscale);
2515 vfitab = _mm256_cvttps_epi32(rt);
2516 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2517 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2518 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2519 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2520 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2521 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2523 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2524 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2525 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2526 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2527 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2528 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2529 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2530 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2531 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2532 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2533 Heps = _mm256_mul_ps(vfeps,H);
2534 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2535 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2536 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2540 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2542 /* Calculate temporary vectorial force */
2543 tx = _mm256_mul_ps(fscal,dx31);
2544 ty = _mm256_mul_ps(fscal,dy31);
2545 tz = _mm256_mul_ps(fscal,dz31);
2547 /* Update vectorial force */
2548 fix3 = _mm256_add_ps(fix3,tx);
2549 fiy3 = _mm256_add_ps(fiy3,ty);
2550 fiz3 = _mm256_add_ps(fiz3,tz);
2552 fjx1 = _mm256_add_ps(fjx1,tx);
2553 fjy1 = _mm256_add_ps(fjy1,ty);
2554 fjz1 = _mm256_add_ps(fjz1,tz);
2556 /**************************
2557 * CALCULATE INTERACTIONS *
2558 **************************/
2560 r32 = _mm256_mul_ps(rsq32,rinv32);
2561 r32 = _mm256_andnot_ps(dummy_mask,r32);
2563 /* Calculate table index by multiplying r with table scale and truncate to integer */
2564 rt = _mm256_mul_ps(r32,vftabscale);
2565 vfitab = _mm256_cvttps_epi32(rt);
2566 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2567 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2568 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2569 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2570 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2571 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2573 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2574 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2575 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2576 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2577 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2578 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2579 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2580 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2581 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2582 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2583 Heps = _mm256_mul_ps(vfeps,H);
2584 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2585 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2586 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2590 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2592 /* Calculate temporary vectorial force */
2593 tx = _mm256_mul_ps(fscal,dx32);
2594 ty = _mm256_mul_ps(fscal,dy32);
2595 tz = _mm256_mul_ps(fscal,dz32);
2597 /* Update vectorial force */
2598 fix3 = _mm256_add_ps(fix3,tx);
2599 fiy3 = _mm256_add_ps(fiy3,ty);
2600 fiz3 = _mm256_add_ps(fiz3,tz);
2602 fjx2 = _mm256_add_ps(fjx2,tx);
2603 fjy2 = _mm256_add_ps(fjy2,ty);
2604 fjz2 = _mm256_add_ps(fjz2,tz);
2606 /**************************
2607 * CALCULATE INTERACTIONS *
2608 **************************/
2610 r33 = _mm256_mul_ps(rsq33,rinv33);
2611 r33 = _mm256_andnot_ps(dummy_mask,r33);
2613 /* Calculate table index by multiplying r with table scale and truncate to integer */
2614 rt = _mm256_mul_ps(r33,vftabscale);
2615 vfitab = _mm256_cvttps_epi32(rt);
2616 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2617 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2618 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2619 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2620 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2621 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2623 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2624 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2625 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2626 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2627 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2628 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2629 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2630 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2631 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2632 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2633 Heps = _mm256_mul_ps(vfeps,H);
2634 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2635 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2636 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2640 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2642 /* Calculate temporary vectorial force */
2643 tx = _mm256_mul_ps(fscal,dx33);
2644 ty = _mm256_mul_ps(fscal,dy33);
2645 tz = _mm256_mul_ps(fscal,dz33);
2647 /* Update vectorial force */
2648 fix3 = _mm256_add_ps(fix3,tx);
2649 fiy3 = _mm256_add_ps(fiy3,ty);
2650 fiz3 = _mm256_add_ps(fiz3,tz);
2652 fjx3 = _mm256_add_ps(fjx3,tx);
2653 fjy3 = _mm256_add_ps(fjy3,ty);
2654 fjz3 = _mm256_add_ps(fjz3,tz);
2656 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2657 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2658 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2659 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2660 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2661 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2662 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2663 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2665 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA+DIM,fjptrB+DIM,fjptrC+DIM,fjptrD+DIM,
2666 fjptrE+DIM,fjptrF+DIM,fjptrG+DIM,fjptrH+DIM,
2667 fjx1,fjy1,fjz1,fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2669 /* Inner loop uses 360 flops */
2672 /* End of innermost loop */
2674 gmx_mm256_update_iforce_3atom_swizzle_ps(fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
2675 f+i_coord_offset+DIM,fshift+i_shift_offset);
2677 /* Increment number of inner iterations */
2678 inneriter += j_index_end - j_index_start;
2680 /* Outer loop uses 18 flops */
2683 /* Increment number of outer iterations */
2686 /* Update outer/inner flops */
2688 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W4W4_F,outeriter*18 + inneriter*360);