2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "gromacs/legacyheaders/types/simple.h"
44 #include "gromacs/math/vec.h"
45 #include "gromacs/legacyheaders/nrnb.h"
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_single
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: None
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_VF_avx_256_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70 * jnr indices corresponding to data put in the eight positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrE,jnrF,jnrG,jnrH;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
85 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
93 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
110 __m128i vfitab_lo,vfitab_hi;
111 __m128i ifour = _mm_set1_epi32(4);
112 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
114 __m256 dummy_mask,cutoff_mask;
115 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
116 __m256 one = _mm256_set1_ps(1.0);
117 __m256 two = _mm256_set1_ps(2.0);
123 jindex = nlist->jindex;
125 shiftidx = nlist->shift;
127 shiftvec = fr->shift_vec[0];
128 fshift = fr->fshift[0];
129 facel = _mm256_set1_ps(fr->epsfac);
130 charge = mdatoms->chargeA;
132 vftab = kernel_data->table_elec->data;
133 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
135 /* Setup water-specific parameters */
136 inr = nlist->iinr[0];
137 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
138 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
139 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
141 jq0 = _mm256_set1_ps(charge[inr+0]);
142 jq1 = _mm256_set1_ps(charge[inr+1]);
143 jq2 = _mm256_set1_ps(charge[inr+2]);
144 qq00 = _mm256_mul_ps(iq0,jq0);
145 qq01 = _mm256_mul_ps(iq0,jq1);
146 qq02 = _mm256_mul_ps(iq0,jq2);
147 qq10 = _mm256_mul_ps(iq1,jq0);
148 qq11 = _mm256_mul_ps(iq1,jq1);
149 qq12 = _mm256_mul_ps(iq1,jq2);
150 qq20 = _mm256_mul_ps(iq2,jq0);
151 qq21 = _mm256_mul_ps(iq2,jq1);
152 qq22 = _mm256_mul_ps(iq2,jq2);
154 /* Avoid stupid compiler warnings */
155 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
168 for(iidx=0;iidx<4*DIM;iidx++)
173 /* Start outer loop over neighborlists */
174 for(iidx=0; iidx<nri; iidx++)
176 /* Load shift vector for this list */
177 i_shift_offset = DIM*shiftidx[iidx];
179 /* Load limits for loop over neighbors */
180 j_index_start = jindex[iidx];
181 j_index_end = jindex[iidx+1];
183 /* Get outer coordinate index */
185 i_coord_offset = DIM*inr;
187 /* Load i particle coords and add shift vector */
188 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
189 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
191 fix0 = _mm256_setzero_ps();
192 fiy0 = _mm256_setzero_ps();
193 fiz0 = _mm256_setzero_ps();
194 fix1 = _mm256_setzero_ps();
195 fiy1 = _mm256_setzero_ps();
196 fiz1 = _mm256_setzero_ps();
197 fix2 = _mm256_setzero_ps();
198 fiy2 = _mm256_setzero_ps();
199 fiz2 = _mm256_setzero_ps();
201 /* Reset potential sums */
202 velecsum = _mm256_setzero_ps();
204 /* Start inner kernel loop */
205 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
208 /* Get j neighbor index, and coordinate index */
217 j_coord_offsetA = DIM*jnrA;
218 j_coord_offsetB = DIM*jnrB;
219 j_coord_offsetC = DIM*jnrC;
220 j_coord_offsetD = DIM*jnrD;
221 j_coord_offsetE = DIM*jnrE;
222 j_coord_offsetF = DIM*jnrF;
223 j_coord_offsetG = DIM*jnrG;
224 j_coord_offsetH = DIM*jnrH;
226 /* load j atom coordinates */
227 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
228 x+j_coord_offsetC,x+j_coord_offsetD,
229 x+j_coord_offsetE,x+j_coord_offsetF,
230 x+j_coord_offsetG,x+j_coord_offsetH,
231 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
233 /* Calculate displacement vector */
234 dx00 = _mm256_sub_ps(ix0,jx0);
235 dy00 = _mm256_sub_ps(iy0,jy0);
236 dz00 = _mm256_sub_ps(iz0,jz0);
237 dx01 = _mm256_sub_ps(ix0,jx1);
238 dy01 = _mm256_sub_ps(iy0,jy1);
239 dz01 = _mm256_sub_ps(iz0,jz1);
240 dx02 = _mm256_sub_ps(ix0,jx2);
241 dy02 = _mm256_sub_ps(iy0,jy2);
242 dz02 = _mm256_sub_ps(iz0,jz2);
243 dx10 = _mm256_sub_ps(ix1,jx0);
244 dy10 = _mm256_sub_ps(iy1,jy0);
245 dz10 = _mm256_sub_ps(iz1,jz0);
246 dx11 = _mm256_sub_ps(ix1,jx1);
247 dy11 = _mm256_sub_ps(iy1,jy1);
248 dz11 = _mm256_sub_ps(iz1,jz1);
249 dx12 = _mm256_sub_ps(ix1,jx2);
250 dy12 = _mm256_sub_ps(iy1,jy2);
251 dz12 = _mm256_sub_ps(iz1,jz2);
252 dx20 = _mm256_sub_ps(ix2,jx0);
253 dy20 = _mm256_sub_ps(iy2,jy0);
254 dz20 = _mm256_sub_ps(iz2,jz0);
255 dx21 = _mm256_sub_ps(ix2,jx1);
256 dy21 = _mm256_sub_ps(iy2,jy1);
257 dz21 = _mm256_sub_ps(iz2,jz1);
258 dx22 = _mm256_sub_ps(ix2,jx2);
259 dy22 = _mm256_sub_ps(iy2,jy2);
260 dz22 = _mm256_sub_ps(iz2,jz2);
262 /* Calculate squared distance and things based on it */
263 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
264 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
265 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
266 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
267 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
268 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
269 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
270 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
271 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
273 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
274 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
275 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
276 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
277 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
278 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
279 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
280 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
281 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
283 fjx0 = _mm256_setzero_ps();
284 fjy0 = _mm256_setzero_ps();
285 fjz0 = _mm256_setzero_ps();
286 fjx1 = _mm256_setzero_ps();
287 fjy1 = _mm256_setzero_ps();
288 fjz1 = _mm256_setzero_ps();
289 fjx2 = _mm256_setzero_ps();
290 fjy2 = _mm256_setzero_ps();
291 fjz2 = _mm256_setzero_ps();
293 /**************************
294 * CALCULATE INTERACTIONS *
295 **************************/
297 r00 = _mm256_mul_ps(rsq00,rinv00);
299 /* Calculate table index by multiplying r with table scale and truncate to integer */
300 rt = _mm256_mul_ps(r00,vftabscale);
301 vfitab = _mm256_cvttps_epi32(rt);
302 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
303 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
304 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
305 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
306 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
307 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
309 /* CUBIC SPLINE TABLE ELECTROSTATICS */
310 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
311 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
312 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
313 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
314 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
315 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
316 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
317 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
318 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
319 Heps = _mm256_mul_ps(vfeps,H);
320 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
321 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
322 velec = _mm256_mul_ps(qq00,VV);
323 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
324 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
326 /* Update potential sum for this i atom from the interaction with this j atom. */
327 velecsum = _mm256_add_ps(velecsum,velec);
331 /* Calculate temporary vectorial force */
332 tx = _mm256_mul_ps(fscal,dx00);
333 ty = _mm256_mul_ps(fscal,dy00);
334 tz = _mm256_mul_ps(fscal,dz00);
336 /* Update vectorial force */
337 fix0 = _mm256_add_ps(fix0,tx);
338 fiy0 = _mm256_add_ps(fiy0,ty);
339 fiz0 = _mm256_add_ps(fiz0,tz);
341 fjx0 = _mm256_add_ps(fjx0,tx);
342 fjy0 = _mm256_add_ps(fjy0,ty);
343 fjz0 = _mm256_add_ps(fjz0,tz);
345 /**************************
346 * CALCULATE INTERACTIONS *
347 **************************/
349 r01 = _mm256_mul_ps(rsq01,rinv01);
351 /* Calculate table index by multiplying r with table scale and truncate to integer */
352 rt = _mm256_mul_ps(r01,vftabscale);
353 vfitab = _mm256_cvttps_epi32(rt);
354 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
355 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
356 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
357 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
358 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
359 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
361 /* CUBIC SPLINE TABLE ELECTROSTATICS */
362 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
363 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
364 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
365 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
366 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
367 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
368 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
369 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
370 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
371 Heps = _mm256_mul_ps(vfeps,H);
372 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
373 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
374 velec = _mm256_mul_ps(qq01,VV);
375 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
376 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velecsum = _mm256_add_ps(velecsum,velec);
383 /* Calculate temporary vectorial force */
384 tx = _mm256_mul_ps(fscal,dx01);
385 ty = _mm256_mul_ps(fscal,dy01);
386 tz = _mm256_mul_ps(fscal,dz01);
388 /* Update vectorial force */
389 fix0 = _mm256_add_ps(fix0,tx);
390 fiy0 = _mm256_add_ps(fiy0,ty);
391 fiz0 = _mm256_add_ps(fiz0,tz);
393 fjx1 = _mm256_add_ps(fjx1,tx);
394 fjy1 = _mm256_add_ps(fjy1,ty);
395 fjz1 = _mm256_add_ps(fjz1,tz);
397 /**************************
398 * CALCULATE INTERACTIONS *
399 **************************/
401 r02 = _mm256_mul_ps(rsq02,rinv02);
403 /* Calculate table index by multiplying r with table scale and truncate to integer */
404 rt = _mm256_mul_ps(r02,vftabscale);
405 vfitab = _mm256_cvttps_epi32(rt);
406 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
407 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
408 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
409 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
410 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
411 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
413 /* CUBIC SPLINE TABLE ELECTROSTATICS */
414 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
415 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
416 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
417 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
418 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
419 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
420 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
421 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
422 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
423 Heps = _mm256_mul_ps(vfeps,H);
424 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
425 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
426 velec = _mm256_mul_ps(qq02,VV);
427 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
428 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velecsum = _mm256_add_ps(velecsum,velec);
435 /* Calculate temporary vectorial force */
436 tx = _mm256_mul_ps(fscal,dx02);
437 ty = _mm256_mul_ps(fscal,dy02);
438 tz = _mm256_mul_ps(fscal,dz02);
440 /* Update vectorial force */
441 fix0 = _mm256_add_ps(fix0,tx);
442 fiy0 = _mm256_add_ps(fiy0,ty);
443 fiz0 = _mm256_add_ps(fiz0,tz);
445 fjx2 = _mm256_add_ps(fjx2,tx);
446 fjy2 = _mm256_add_ps(fjy2,ty);
447 fjz2 = _mm256_add_ps(fjz2,tz);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 r10 = _mm256_mul_ps(rsq10,rinv10);
455 /* Calculate table index by multiplying r with table scale and truncate to integer */
456 rt = _mm256_mul_ps(r10,vftabscale);
457 vfitab = _mm256_cvttps_epi32(rt);
458 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
459 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
460 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
461 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
462 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
463 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
465 /* CUBIC SPLINE TABLE ELECTROSTATICS */
466 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
467 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
468 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
469 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
470 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
471 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
472 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
473 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
474 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
475 Heps = _mm256_mul_ps(vfeps,H);
476 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
477 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
478 velec = _mm256_mul_ps(qq10,VV);
479 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
480 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
482 /* Update potential sum for this i atom from the interaction with this j atom. */
483 velecsum = _mm256_add_ps(velecsum,velec);
487 /* Calculate temporary vectorial force */
488 tx = _mm256_mul_ps(fscal,dx10);
489 ty = _mm256_mul_ps(fscal,dy10);
490 tz = _mm256_mul_ps(fscal,dz10);
492 /* Update vectorial force */
493 fix1 = _mm256_add_ps(fix1,tx);
494 fiy1 = _mm256_add_ps(fiy1,ty);
495 fiz1 = _mm256_add_ps(fiz1,tz);
497 fjx0 = _mm256_add_ps(fjx0,tx);
498 fjy0 = _mm256_add_ps(fjy0,ty);
499 fjz0 = _mm256_add_ps(fjz0,tz);
501 /**************************
502 * CALCULATE INTERACTIONS *
503 **************************/
505 r11 = _mm256_mul_ps(rsq11,rinv11);
507 /* Calculate table index by multiplying r with table scale and truncate to integer */
508 rt = _mm256_mul_ps(r11,vftabscale);
509 vfitab = _mm256_cvttps_epi32(rt);
510 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
511 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
512 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
513 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
514 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
515 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
517 /* CUBIC SPLINE TABLE ELECTROSTATICS */
518 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
519 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
520 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
521 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
522 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
523 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
524 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
525 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
526 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
527 Heps = _mm256_mul_ps(vfeps,H);
528 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
529 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
530 velec = _mm256_mul_ps(qq11,VV);
531 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
532 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
534 /* Update potential sum for this i atom from the interaction with this j atom. */
535 velecsum = _mm256_add_ps(velecsum,velec);
539 /* Calculate temporary vectorial force */
540 tx = _mm256_mul_ps(fscal,dx11);
541 ty = _mm256_mul_ps(fscal,dy11);
542 tz = _mm256_mul_ps(fscal,dz11);
544 /* Update vectorial force */
545 fix1 = _mm256_add_ps(fix1,tx);
546 fiy1 = _mm256_add_ps(fiy1,ty);
547 fiz1 = _mm256_add_ps(fiz1,tz);
549 fjx1 = _mm256_add_ps(fjx1,tx);
550 fjy1 = _mm256_add_ps(fjy1,ty);
551 fjz1 = _mm256_add_ps(fjz1,tz);
553 /**************************
554 * CALCULATE INTERACTIONS *
555 **************************/
557 r12 = _mm256_mul_ps(rsq12,rinv12);
559 /* Calculate table index by multiplying r with table scale and truncate to integer */
560 rt = _mm256_mul_ps(r12,vftabscale);
561 vfitab = _mm256_cvttps_epi32(rt);
562 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
563 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
564 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
565 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
566 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
567 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
569 /* CUBIC SPLINE TABLE ELECTROSTATICS */
570 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
571 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
572 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
573 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
574 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
575 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
576 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
577 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
578 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
579 Heps = _mm256_mul_ps(vfeps,H);
580 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
581 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
582 velec = _mm256_mul_ps(qq12,VV);
583 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
584 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
586 /* Update potential sum for this i atom from the interaction with this j atom. */
587 velecsum = _mm256_add_ps(velecsum,velec);
591 /* Calculate temporary vectorial force */
592 tx = _mm256_mul_ps(fscal,dx12);
593 ty = _mm256_mul_ps(fscal,dy12);
594 tz = _mm256_mul_ps(fscal,dz12);
596 /* Update vectorial force */
597 fix1 = _mm256_add_ps(fix1,tx);
598 fiy1 = _mm256_add_ps(fiy1,ty);
599 fiz1 = _mm256_add_ps(fiz1,tz);
601 fjx2 = _mm256_add_ps(fjx2,tx);
602 fjy2 = _mm256_add_ps(fjy2,ty);
603 fjz2 = _mm256_add_ps(fjz2,tz);
605 /**************************
606 * CALCULATE INTERACTIONS *
607 **************************/
609 r20 = _mm256_mul_ps(rsq20,rinv20);
611 /* Calculate table index by multiplying r with table scale and truncate to integer */
612 rt = _mm256_mul_ps(r20,vftabscale);
613 vfitab = _mm256_cvttps_epi32(rt);
614 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
615 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
616 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
617 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
618 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
619 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
621 /* CUBIC SPLINE TABLE ELECTROSTATICS */
622 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
623 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
624 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
625 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
626 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
627 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
628 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
629 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
630 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
631 Heps = _mm256_mul_ps(vfeps,H);
632 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
633 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
634 velec = _mm256_mul_ps(qq20,VV);
635 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
636 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
638 /* Update potential sum for this i atom from the interaction with this j atom. */
639 velecsum = _mm256_add_ps(velecsum,velec);
643 /* Calculate temporary vectorial force */
644 tx = _mm256_mul_ps(fscal,dx20);
645 ty = _mm256_mul_ps(fscal,dy20);
646 tz = _mm256_mul_ps(fscal,dz20);
648 /* Update vectorial force */
649 fix2 = _mm256_add_ps(fix2,tx);
650 fiy2 = _mm256_add_ps(fiy2,ty);
651 fiz2 = _mm256_add_ps(fiz2,tz);
653 fjx0 = _mm256_add_ps(fjx0,tx);
654 fjy0 = _mm256_add_ps(fjy0,ty);
655 fjz0 = _mm256_add_ps(fjz0,tz);
657 /**************************
658 * CALCULATE INTERACTIONS *
659 **************************/
661 r21 = _mm256_mul_ps(rsq21,rinv21);
663 /* Calculate table index by multiplying r with table scale and truncate to integer */
664 rt = _mm256_mul_ps(r21,vftabscale);
665 vfitab = _mm256_cvttps_epi32(rt);
666 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
667 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
668 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
669 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
670 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
671 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
673 /* CUBIC SPLINE TABLE ELECTROSTATICS */
674 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
675 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
676 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
677 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
678 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
679 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
680 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
681 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
682 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
683 Heps = _mm256_mul_ps(vfeps,H);
684 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
685 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
686 velec = _mm256_mul_ps(qq21,VV);
687 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
688 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
690 /* Update potential sum for this i atom from the interaction with this j atom. */
691 velecsum = _mm256_add_ps(velecsum,velec);
695 /* Calculate temporary vectorial force */
696 tx = _mm256_mul_ps(fscal,dx21);
697 ty = _mm256_mul_ps(fscal,dy21);
698 tz = _mm256_mul_ps(fscal,dz21);
700 /* Update vectorial force */
701 fix2 = _mm256_add_ps(fix2,tx);
702 fiy2 = _mm256_add_ps(fiy2,ty);
703 fiz2 = _mm256_add_ps(fiz2,tz);
705 fjx1 = _mm256_add_ps(fjx1,tx);
706 fjy1 = _mm256_add_ps(fjy1,ty);
707 fjz1 = _mm256_add_ps(fjz1,tz);
709 /**************************
710 * CALCULATE INTERACTIONS *
711 **************************/
713 r22 = _mm256_mul_ps(rsq22,rinv22);
715 /* Calculate table index by multiplying r with table scale and truncate to integer */
716 rt = _mm256_mul_ps(r22,vftabscale);
717 vfitab = _mm256_cvttps_epi32(rt);
718 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
719 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
720 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
721 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
722 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
723 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
725 /* CUBIC SPLINE TABLE ELECTROSTATICS */
726 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
727 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
728 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
729 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
730 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
731 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
732 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
733 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
734 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
735 Heps = _mm256_mul_ps(vfeps,H);
736 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
737 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
738 velec = _mm256_mul_ps(qq22,VV);
739 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
740 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
742 /* Update potential sum for this i atom from the interaction with this j atom. */
743 velecsum = _mm256_add_ps(velecsum,velec);
747 /* Calculate temporary vectorial force */
748 tx = _mm256_mul_ps(fscal,dx22);
749 ty = _mm256_mul_ps(fscal,dy22);
750 tz = _mm256_mul_ps(fscal,dz22);
752 /* Update vectorial force */
753 fix2 = _mm256_add_ps(fix2,tx);
754 fiy2 = _mm256_add_ps(fiy2,ty);
755 fiz2 = _mm256_add_ps(fiz2,tz);
757 fjx2 = _mm256_add_ps(fjx2,tx);
758 fjy2 = _mm256_add_ps(fjy2,ty);
759 fjz2 = _mm256_add_ps(fjz2,tz);
761 fjptrA = f+j_coord_offsetA;
762 fjptrB = f+j_coord_offsetB;
763 fjptrC = f+j_coord_offsetC;
764 fjptrD = f+j_coord_offsetD;
765 fjptrE = f+j_coord_offsetE;
766 fjptrF = f+j_coord_offsetF;
767 fjptrG = f+j_coord_offsetG;
768 fjptrH = f+j_coord_offsetH;
770 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
771 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
773 /* Inner loop uses 387 flops */
779 /* Get j neighbor index, and coordinate index */
780 jnrlistA = jjnr[jidx];
781 jnrlistB = jjnr[jidx+1];
782 jnrlistC = jjnr[jidx+2];
783 jnrlistD = jjnr[jidx+3];
784 jnrlistE = jjnr[jidx+4];
785 jnrlistF = jjnr[jidx+5];
786 jnrlistG = jjnr[jidx+6];
787 jnrlistH = jjnr[jidx+7];
788 /* Sign of each element will be negative for non-real atoms.
789 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
790 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
792 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
793 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
795 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
796 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
797 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
798 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
799 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
800 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
801 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
802 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
803 j_coord_offsetA = DIM*jnrA;
804 j_coord_offsetB = DIM*jnrB;
805 j_coord_offsetC = DIM*jnrC;
806 j_coord_offsetD = DIM*jnrD;
807 j_coord_offsetE = DIM*jnrE;
808 j_coord_offsetF = DIM*jnrF;
809 j_coord_offsetG = DIM*jnrG;
810 j_coord_offsetH = DIM*jnrH;
812 /* load j atom coordinates */
813 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
814 x+j_coord_offsetC,x+j_coord_offsetD,
815 x+j_coord_offsetE,x+j_coord_offsetF,
816 x+j_coord_offsetG,x+j_coord_offsetH,
817 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
819 /* Calculate displacement vector */
820 dx00 = _mm256_sub_ps(ix0,jx0);
821 dy00 = _mm256_sub_ps(iy0,jy0);
822 dz00 = _mm256_sub_ps(iz0,jz0);
823 dx01 = _mm256_sub_ps(ix0,jx1);
824 dy01 = _mm256_sub_ps(iy0,jy1);
825 dz01 = _mm256_sub_ps(iz0,jz1);
826 dx02 = _mm256_sub_ps(ix0,jx2);
827 dy02 = _mm256_sub_ps(iy0,jy2);
828 dz02 = _mm256_sub_ps(iz0,jz2);
829 dx10 = _mm256_sub_ps(ix1,jx0);
830 dy10 = _mm256_sub_ps(iy1,jy0);
831 dz10 = _mm256_sub_ps(iz1,jz0);
832 dx11 = _mm256_sub_ps(ix1,jx1);
833 dy11 = _mm256_sub_ps(iy1,jy1);
834 dz11 = _mm256_sub_ps(iz1,jz1);
835 dx12 = _mm256_sub_ps(ix1,jx2);
836 dy12 = _mm256_sub_ps(iy1,jy2);
837 dz12 = _mm256_sub_ps(iz1,jz2);
838 dx20 = _mm256_sub_ps(ix2,jx0);
839 dy20 = _mm256_sub_ps(iy2,jy0);
840 dz20 = _mm256_sub_ps(iz2,jz0);
841 dx21 = _mm256_sub_ps(ix2,jx1);
842 dy21 = _mm256_sub_ps(iy2,jy1);
843 dz21 = _mm256_sub_ps(iz2,jz1);
844 dx22 = _mm256_sub_ps(ix2,jx2);
845 dy22 = _mm256_sub_ps(iy2,jy2);
846 dz22 = _mm256_sub_ps(iz2,jz2);
848 /* Calculate squared distance and things based on it */
849 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
850 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
851 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
852 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
853 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
854 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
855 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
856 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
857 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
859 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
860 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
861 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
862 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
863 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
864 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
865 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
866 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
867 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
869 fjx0 = _mm256_setzero_ps();
870 fjy0 = _mm256_setzero_ps();
871 fjz0 = _mm256_setzero_ps();
872 fjx1 = _mm256_setzero_ps();
873 fjy1 = _mm256_setzero_ps();
874 fjz1 = _mm256_setzero_ps();
875 fjx2 = _mm256_setzero_ps();
876 fjy2 = _mm256_setzero_ps();
877 fjz2 = _mm256_setzero_ps();
879 /**************************
880 * CALCULATE INTERACTIONS *
881 **************************/
883 r00 = _mm256_mul_ps(rsq00,rinv00);
884 r00 = _mm256_andnot_ps(dummy_mask,r00);
886 /* Calculate table index by multiplying r with table scale and truncate to integer */
887 rt = _mm256_mul_ps(r00,vftabscale);
888 vfitab = _mm256_cvttps_epi32(rt);
889 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
890 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
891 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
892 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
893 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
894 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
896 /* CUBIC SPLINE TABLE ELECTROSTATICS */
897 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
898 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
899 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
900 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
901 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
902 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
903 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
904 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
905 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
906 Heps = _mm256_mul_ps(vfeps,H);
907 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
908 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
909 velec = _mm256_mul_ps(qq00,VV);
910 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
911 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
913 /* Update potential sum for this i atom from the interaction with this j atom. */
914 velec = _mm256_andnot_ps(dummy_mask,velec);
915 velecsum = _mm256_add_ps(velecsum,velec);
919 fscal = _mm256_andnot_ps(dummy_mask,fscal);
921 /* Calculate temporary vectorial force */
922 tx = _mm256_mul_ps(fscal,dx00);
923 ty = _mm256_mul_ps(fscal,dy00);
924 tz = _mm256_mul_ps(fscal,dz00);
926 /* Update vectorial force */
927 fix0 = _mm256_add_ps(fix0,tx);
928 fiy0 = _mm256_add_ps(fiy0,ty);
929 fiz0 = _mm256_add_ps(fiz0,tz);
931 fjx0 = _mm256_add_ps(fjx0,tx);
932 fjy0 = _mm256_add_ps(fjy0,ty);
933 fjz0 = _mm256_add_ps(fjz0,tz);
935 /**************************
936 * CALCULATE INTERACTIONS *
937 **************************/
939 r01 = _mm256_mul_ps(rsq01,rinv01);
940 r01 = _mm256_andnot_ps(dummy_mask,r01);
942 /* Calculate table index by multiplying r with table scale and truncate to integer */
943 rt = _mm256_mul_ps(r01,vftabscale);
944 vfitab = _mm256_cvttps_epi32(rt);
945 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
946 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
947 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
948 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
949 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
950 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
952 /* CUBIC SPLINE TABLE ELECTROSTATICS */
953 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
954 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
955 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
956 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
957 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
958 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
959 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
960 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
961 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
962 Heps = _mm256_mul_ps(vfeps,H);
963 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
964 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
965 velec = _mm256_mul_ps(qq01,VV);
966 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
967 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
969 /* Update potential sum for this i atom from the interaction with this j atom. */
970 velec = _mm256_andnot_ps(dummy_mask,velec);
971 velecsum = _mm256_add_ps(velecsum,velec);
975 fscal = _mm256_andnot_ps(dummy_mask,fscal);
977 /* Calculate temporary vectorial force */
978 tx = _mm256_mul_ps(fscal,dx01);
979 ty = _mm256_mul_ps(fscal,dy01);
980 tz = _mm256_mul_ps(fscal,dz01);
982 /* Update vectorial force */
983 fix0 = _mm256_add_ps(fix0,tx);
984 fiy0 = _mm256_add_ps(fiy0,ty);
985 fiz0 = _mm256_add_ps(fiz0,tz);
987 fjx1 = _mm256_add_ps(fjx1,tx);
988 fjy1 = _mm256_add_ps(fjy1,ty);
989 fjz1 = _mm256_add_ps(fjz1,tz);
991 /**************************
992 * CALCULATE INTERACTIONS *
993 **************************/
995 r02 = _mm256_mul_ps(rsq02,rinv02);
996 r02 = _mm256_andnot_ps(dummy_mask,r02);
998 /* Calculate table index by multiplying r with table scale and truncate to integer */
999 rt = _mm256_mul_ps(r02,vftabscale);
1000 vfitab = _mm256_cvttps_epi32(rt);
1001 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1002 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1003 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1004 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1005 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1006 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1008 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1009 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1010 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1011 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1012 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1013 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1014 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1015 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1016 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1017 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1018 Heps = _mm256_mul_ps(vfeps,H);
1019 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1020 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1021 velec = _mm256_mul_ps(qq02,VV);
1022 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1023 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1025 /* Update potential sum for this i atom from the interaction with this j atom. */
1026 velec = _mm256_andnot_ps(dummy_mask,velec);
1027 velecsum = _mm256_add_ps(velecsum,velec);
1031 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1033 /* Calculate temporary vectorial force */
1034 tx = _mm256_mul_ps(fscal,dx02);
1035 ty = _mm256_mul_ps(fscal,dy02);
1036 tz = _mm256_mul_ps(fscal,dz02);
1038 /* Update vectorial force */
1039 fix0 = _mm256_add_ps(fix0,tx);
1040 fiy0 = _mm256_add_ps(fiy0,ty);
1041 fiz0 = _mm256_add_ps(fiz0,tz);
1043 fjx2 = _mm256_add_ps(fjx2,tx);
1044 fjy2 = _mm256_add_ps(fjy2,ty);
1045 fjz2 = _mm256_add_ps(fjz2,tz);
1047 /**************************
1048 * CALCULATE INTERACTIONS *
1049 **************************/
1051 r10 = _mm256_mul_ps(rsq10,rinv10);
1052 r10 = _mm256_andnot_ps(dummy_mask,r10);
1054 /* Calculate table index by multiplying r with table scale and truncate to integer */
1055 rt = _mm256_mul_ps(r10,vftabscale);
1056 vfitab = _mm256_cvttps_epi32(rt);
1057 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1058 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1059 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1060 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1061 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1062 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1064 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1065 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1066 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1067 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1068 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1069 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1070 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1071 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1072 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1073 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1074 Heps = _mm256_mul_ps(vfeps,H);
1075 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1076 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1077 velec = _mm256_mul_ps(qq10,VV);
1078 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1079 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1081 /* Update potential sum for this i atom from the interaction with this j atom. */
1082 velec = _mm256_andnot_ps(dummy_mask,velec);
1083 velecsum = _mm256_add_ps(velecsum,velec);
1087 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1089 /* Calculate temporary vectorial force */
1090 tx = _mm256_mul_ps(fscal,dx10);
1091 ty = _mm256_mul_ps(fscal,dy10);
1092 tz = _mm256_mul_ps(fscal,dz10);
1094 /* Update vectorial force */
1095 fix1 = _mm256_add_ps(fix1,tx);
1096 fiy1 = _mm256_add_ps(fiy1,ty);
1097 fiz1 = _mm256_add_ps(fiz1,tz);
1099 fjx0 = _mm256_add_ps(fjx0,tx);
1100 fjy0 = _mm256_add_ps(fjy0,ty);
1101 fjz0 = _mm256_add_ps(fjz0,tz);
1103 /**************************
1104 * CALCULATE INTERACTIONS *
1105 **************************/
1107 r11 = _mm256_mul_ps(rsq11,rinv11);
1108 r11 = _mm256_andnot_ps(dummy_mask,r11);
1110 /* Calculate table index by multiplying r with table scale and truncate to integer */
1111 rt = _mm256_mul_ps(r11,vftabscale);
1112 vfitab = _mm256_cvttps_epi32(rt);
1113 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1114 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1115 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1116 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1117 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1118 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1120 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1121 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1122 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1123 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1124 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1125 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1126 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1127 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1128 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1129 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1130 Heps = _mm256_mul_ps(vfeps,H);
1131 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1132 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1133 velec = _mm256_mul_ps(qq11,VV);
1134 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1135 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1137 /* Update potential sum for this i atom from the interaction with this j atom. */
1138 velec = _mm256_andnot_ps(dummy_mask,velec);
1139 velecsum = _mm256_add_ps(velecsum,velec);
1143 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1145 /* Calculate temporary vectorial force */
1146 tx = _mm256_mul_ps(fscal,dx11);
1147 ty = _mm256_mul_ps(fscal,dy11);
1148 tz = _mm256_mul_ps(fscal,dz11);
1150 /* Update vectorial force */
1151 fix1 = _mm256_add_ps(fix1,tx);
1152 fiy1 = _mm256_add_ps(fiy1,ty);
1153 fiz1 = _mm256_add_ps(fiz1,tz);
1155 fjx1 = _mm256_add_ps(fjx1,tx);
1156 fjy1 = _mm256_add_ps(fjy1,ty);
1157 fjz1 = _mm256_add_ps(fjz1,tz);
1159 /**************************
1160 * CALCULATE INTERACTIONS *
1161 **************************/
1163 r12 = _mm256_mul_ps(rsq12,rinv12);
1164 r12 = _mm256_andnot_ps(dummy_mask,r12);
1166 /* Calculate table index by multiplying r with table scale and truncate to integer */
1167 rt = _mm256_mul_ps(r12,vftabscale);
1168 vfitab = _mm256_cvttps_epi32(rt);
1169 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1170 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1171 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1172 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1173 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1174 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1176 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1177 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1178 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1179 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1180 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1181 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1182 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1183 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1184 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1185 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1186 Heps = _mm256_mul_ps(vfeps,H);
1187 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1188 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1189 velec = _mm256_mul_ps(qq12,VV);
1190 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1191 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1193 /* Update potential sum for this i atom from the interaction with this j atom. */
1194 velec = _mm256_andnot_ps(dummy_mask,velec);
1195 velecsum = _mm256_add_ps(velecsum,velec);
1199 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1201 /* Calculate temporary vectorial force */
1202 tx = _mm256_mul_ps(fscal,dx12);
1203 ty = _mm256_mul_ps(fscal,dy12);
1204 tz = _mm256_mul_ps(fscal,dz12);
1206 /* Update vectorial force */
1207 fix1 = _mm256_add_ps(fix1,tx);
1208 fiy1 = _mm256_add_ps(fiy1,ty);
1209 fiz1 = _mm256_add_ps(fiz1,tz);
1211 fjx2 = _mm256_add_ps(fjx2,tx);
1212 fjy2 = _mm256_add_ps(fjy2,ty);
1213 fjz2 = _mm256_add_ps(fjz2,tz);
1215 /**************************
1216 * CALCULATE INTERACTIONS *
1217 **************************/
1219 r20 = _mm256_mul_ps(rsq20,rinv20);
1220 r20 = _mm256_andnot_ps(dummy_mask,r20);
1222 /* Calculate table index by multiplying r with table scale and truncate to integer */
1223 rt = _mm256_mul_ps(r20,vftabscale);
1224 vfitab = _mm256_cvttps_epi32(rt);
1225 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1226 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1227 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1228 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1229 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1230 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1232 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1233 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1234 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1235 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1236 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1237 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1238 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1239 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1240 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1241 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1242 Heps = _mm256_mul_ps(vfeps,H);
1243 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1244 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1245 velec = _mm256_mul_ps(qq20,VV);
1246 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1247 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1249 /* Update potential sum for this i atom from the interaction with this j atom. */
1250 velec = _mm256_andnot_ps(dummy_mask,velec);
1251 velecsum = _mm256_add_ps(velecsum,velec);
1255 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1257 /* Calculate temporary vectorial force */
1258 tx = _mm256_mul_ps(fscal,dx20);
1259 ty = _mm256_mul_ps(fscal,dy20);
1260 tz = _mm256_mul_ps(fscal,dz20);
1262 /* Update vectorial force */
1263 fix2 = _mm256_add_ps(fix2,tx);
1264 fiy2 = _mm256_add_ps(fiy2,ty);
1265 fiz2 = _mm256_add_ps(fiz2,tz);
1267 fjx0 = _mm256_add_ps(fjx0,tx);
1268 fjy0 = _mm256_add_ps(fjy0,ty);
1269 fjz0 = _mm256_add_ps(fjz0,tz);
1271 /**************************
1272 * CALCULATE INTERACTIONS *
1273 **************************/
1275 r21 = _mm256_mul_ps(rsq21,rinv21);
1276 r21 = _mm256_andnot_ps(dummy_mask,r21);
1278 /* Calculate table index by multiplying r with table scale and truncate to integer */
1279 rt = _mm256_mul_ps(r21,vftabscale);
1280 vfitab = _mm256_cvttps_epi32(rt);
1281 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1282 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1283 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1284 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1285 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1286 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1288 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1289 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1290 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1291 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1292 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1293 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1294 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1295 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1296 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1297 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1298 Heps = _mm256_mul_ps(vfeps,H);
1299 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1300 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1301 velec = _mm256_mul_ps(qq21,VV);
1302 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1303 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1305 /* Update potential sum for this i atom from the interaction with this j atom. */
1306 velec = _mm256_andnot_ps(dummy_mask,velec);
1307 velecsum = _mm256_add_ps(velecsum,velec);
1311 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1313 /* Calculate temporary vectorial force */
1314 tx = _mm256_mul_ps(fscal,dx21);
1315 ty = _mm256_mul_ps(fscal,dy21);
1316 tz = _mm256_mul_ps(fscal,dz21);
1318 /* Update vectorial force */
1319 fix2 = _mm256_add_ps(fix2,tx);
1320 fiy2 = _mm256_add_ps(fiy2,ty);
1321 fiz2 = _mm256_add_ps(fiz2,tz);
1323 fjx1 = _mm256_add_ps(fjx1,tx);
1324 fjy1 = _mm256_add_ps(fjy1,ty);
1325 fjz1 = _mm256_add_ps(fjz1,tz);
1327 /**************************
1328 * CALCULATE INTERACTIONS *
1329 **************************/
1331 r22 = _mm256_mul_ps(rsq22,rinv22);
1332 r22 = _mm256_andnot_ps(dummy_mask,r22);
1334 /* Calculate table index by multiplying r with table scale and truncate to integer */
1335 rt = _mm256_mul_ps(r22,vftabscale);
1336 vfitab = _mm256_cvttps_epi32(rt);
1337 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1338 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1339 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1340 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1341 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1342 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1344 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1345 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1346 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1347 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1348 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1349 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1350 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1351 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1352 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1353 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1354 Heps = _mm256_mul_ps(vfeps,H);
1355 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1356 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1357 velec = _mm256_mul_ps(qq22,VV);
1358 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1359 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1361 /* Update potential sum for this i atom from the interaction with this j atom. */
1362 velec = _mm256_andnot_ps(dummy_mask,velec);
1363 velecsum = _mm256_add_ps(velecsum,velec);
1367 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1369 /* Calculate temporary vectorial force */
1370 tx = _mm256_mul_ps(fscal,dx22);
1371 ty = _mm256_mul_ps(fscal,dy22);
1372 tz = _mm256_mul_ps(fscal,dz22);
1374 /* Update vectorial force */
1375 fix2 = _mm256_add_ps(fix2,tx);
1376 fiy2 = _mm256_add_ps(fiy2,ty);
1377 fiz2 = _mm256_add_ps(fiz2,tz);
1379 fjx2 = _mm256_add_ps(fjx2,tx);
1380 fjy2 = _mm256_add_ps(fjy2,ty);
1381 fjz2 = _mm256_add_ps(fjz2,tz);
1383 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1384 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1385 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1386 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1387 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1388 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1389 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1390 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1392 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1393 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1395 /* Inner loop uses 396 flops */
1398 /* End of innermost loop */
1400 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1401 f+i_coord_offset,fshift+i_shift_offset);
1404 /* Update potential energies */
1405 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1407 /* Increment number of inner iterations */
1408 inneriter += j_index_end - j_index_start;
1410 /* Outer loop uses 19 flops */
1413 /* Increment number of outer iterations */
1416 /* Update outer/inner flops */
1418 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_VF,outeriter*19 + inneriter*396);
1421 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_single
1422 * Electrostatics interaction: CubicSplineTable
1423 * VdW interaction: None
1424 * Geometry: Water3-Water3
1425 * Calculate force/pot: Force
1428 nb_kernel_ElecCSTab_VdwNone_GeomW3W3_F_avx_256_single
1429 (t_nblist * gmx_restrict nlist,
1430 rvec * gmx_restrict xx,
1431 rvec * gmx_restrict ff,
1432 t_forcerec * gmx_restrict fr,
1433 t_mdatoms * gmx_restrict mdatoms,
1434 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1435 t_nrnb * gmx_restrict nrnb)
1437 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1438 * just 0 for non-waters.
1439 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1440 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1442 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1443 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1444 int jnrA,jnrB,jnrC,jnrD;
1445 int jnrE,jnrF,jnrG,jnrH;
1446 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1447 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1448 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1449 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1450 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1451 real rcutoff_scalar;
1452 real *shiftvec,*fshift,*x,*f;
1453 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1454 real scratch[4*DIM];
1455 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1456 real * vdwioffsetptr0;
1457 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1458 real * vdwioffsetptr1;
1459 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1460 real * vdwioffsetptr2;
1461 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1462 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1463 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1464 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1465 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1466 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1467 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1468 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1469 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1470 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1471 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1472 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1473 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1474 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1475 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1476 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1477 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1480 __m128i vfitab_lo,vfitab_hi;
1481 __m128i ifour = _mm_set1_epi32(4);
1482 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1484 __m256 dummy_mask,cutoff_mask;
1485 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1486 __m256 one = _mm256_set1_ps(1.0);
1487 __m256 two = _mm256_set1_ps(2.0);
1493 jindex = nlist->jindex;
1495 shiftidx = nlist->shift;
1497 shiftvec = fr->shift_vec[0];
1498 fshift = fr->fshift[0];
1499 facel = _mm256_set1_ps(fr->epsfac);
1500 charge = mdatoms->chargeA;
1502 vftab = kernel_data->table_elec->data;
1503 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
1505 /* Setup water-specific parameters */
1506 inr = nlist->iinr[0];
1507 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1508 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1509 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1511 jq0 = _mm256_set1_ps(charge[inr+0]);
1512 jq1 = _mm256_set1_ps(charge[inr+1]);
1513 jq2 = _mm256_set1_ps(charge[inr+2]);
1514 qq00 = _mm256_mul_ps(iq0,jq0);
1515 qq01 = _mm256_mul_ps(iq0,jq1);
1516 qq02 = _mm256_mul_ps(iq0,jq2);
1517 qq10 = _mm256_mul_ps(iq1,jq0);
1518 qq11 = _mm256_mul_ps(iq1,jq1);
1519 qq12 = _mm256_mul_ps(iq1,jq2);
1520 qq20 = _mm256_mul_ps(iq2,jq0);
1521 qq21 = _mm256_mul_ps(iq2,jq1);
1522 qq22 = _mm256_mul_ps(iq2,jq2);
1524 /* Avoid stupid compiler warnings */
1525 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1526 j_coord_offsetA = 0;
1527 j_coord_offsetB = 0;
1528 j_coord_offsetC = 0;
1529 j_coord_offsetD = 0;
1530 j_coord_offsetE = 0;
1531 j_coord_offsetF = 0;
1532 j_coord_offsetG = 0;
1533 j_coord_offsetH = 0;
1538 for(iidx=0;iidx<4*DIM;iidx++)
1540 scratch[iidx] = 0.0;
1543 /* Start outer loop over neighborlists */
1544 for(iidx=0; iidx<nri; iidx++)
1546 /* Load shift vector for this list */
1547 i_shift_offset = DIM*shiftidx[iidx];
1549 /* Load limits for loop over neighbors */
1550 j_index_start = jindex[iidx];
1551 j_index_end = jindex[iidx+1];
1553 /* Get outer coordinate index */
1555 i_coord_offset = DIM*inr;
1557 /* Load i particle coords and add shift vector */
1558 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1559 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1561 fix0 = _mm256_setzero_ps();
1562 fiy0 = _mm256_setzero_ps();
1563 fiz0 = _mm256_setzero_ps();
1564 fix1 = _mm256_setzero_ps();
1565 fiy1 = _mm256_setzero_ps();
1566 fiz1 = _mm256_setzero_ps();
1567 fix2 = _mm256_setzero_ps();
1568 fiy2 = _mm256_setzero_ps();
1569 fiz2 = _mm256_setzero_ps();
1571 /* Start inner kernel loop */
1572 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1575 /* Get j neighbor index, and coordinate index */
1577 jnrB = jjnr[jidx+1];
1578 jnrC = jjnr[jidx+2];
1579 jnrD = jjnr[jidx+3];
1580 jnrE = jjnr[jidx+4];
1581 jnrF = jjnr[jidx+5];
1582 jnrG = jjnr[jidx+6];
1583 jnrH = jjnr[jidx+7];
1584 j_coord_offsetA = DIM*jnrA;
1585 j_coord_offsetB = DIM*jnrB;
1586 j_coord_offsetC = DIM*jnrC;
1587 j_coord_offsetD = DIM*jnrD;
1588 j_coord_offsetE = DIM*jnrE;
1589 j_coord_offsetF = DIM*jnrF;
1590 j_coord_offsetG = DIM*jnrG;
1591 j_coord_offsetH = DIM*jnrH;
1593 /* load j atom coordinates */
1594 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1595 x+j_coord_offsetC,x+j_coord_offsetD,
1596 x+j_coord_offsetE,x+j_coord_offsetF,
1597 x+j_coord_offsetG,x+j_coord_offsetH,
1598 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1600 /* Calculate displacement vector */
1601 dx00 = _mm256_sub_ps(ix0,jx0);
1602 dy00 = _mm256_sub_ps(iy0,jy0);
1603 dz00 = _mm256_sub_ps(iz0,jz0);
1604 dx01 = _mm256_sub_ps(ix0,jx1);
1605 dy01 = _mm256_sub_ps(iy0,jy1);
1606 dz01 = _mm256_sub_ps(iz0,jz1);
1607 dx02 = _mm256_sub_ps(ix0,jx2);
1608 dy02 = _mm256_sub_ps(iy0,jy2);
1609 dz02 = _mm256_sub_ps(iz0,jz2);
1610 dx10 = _mm256_sub_ps(ix1,jx0);
1611 dy10 = _mm256_sub_ps(iy1,jy0);
1612 dz10 = _mm256_sub_ps(iz1,jz0);
1613 dx11 = _mm256_sub_ps(ix1,jx1);
1614 dy11 = _mm256_sub_ps(iy1,jy1);
1615 dz11 = _mm256_sub_ps(iz1,jz1);
1616 dx12 = _mm256_sub_ps(ix1,jx2);
1617 dy12 = _mm256_sub_ps(iy1,jy2);
1618 dz12 = _mm256_sub_ps(iz1,jz2);
1619 dx20 = _mm256_sub_ps(ix2,jx0);
1620 dy20 = _mm256_sub_ps(iy2,jy0);
1621 dz20 = _mm256_sub_ps(iz2,jz0);
1622 dx21 = _mm256_sub_ps(ix2,jx1);
1623 dy21 = _mm256_sub_ps(iy2,jy1);
1624 dz21 = _mm256_sub_ps(iz2,jz1);
1625 dx22 = _mm256_sub_ps(ix2,jx2);
1626 dy22 = _mm256_sub_ps(iy2,jy2);
1627 dz22 = _mm256_sub_ps(iz2,jz2);
1629 /* Calculate squared distance and things based on it */
1630 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1631 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1632 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1633 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1634 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1635 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1636 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1637 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1638 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1640 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1641 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1642 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1643 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1644 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1645 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1646 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1647 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1648 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1650 fjx0 = _mm256_setzero_ps();
1651 fjy0 = _mm256_setzero_ps();
1652 fjz0 = _mm256_setzero_ps();
1653 fjx1 = _mm256_setzero_ps();
1654 fjy1 = _mm256_setzero_ps();
1655 fjz1 = _mm256_setzero_ps();
1656 fjx2 = _mm256_setzero_ps();
1657 fjy2 = _mm256_setzero_ps();
1658 fjz2 = _mm256_setzero_ps();
1660 /**************************
1661 * CALCULATE INTERACTIONS *
1662 **************************/
1664 r00 = _mm256_mul_ps(rsq00,rinv00);
1666 /* Calculate table index by multiplying r with table scale and truncate to integer */
1667 rt = _mm256_mul_ps(r00,vftabscale);
1668 vfitab = _mm256_cvttps_epi32(rt);
1669 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1670 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1671 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1672 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1673 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1674 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1676 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1677 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1678 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1679 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1680 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1681 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1682 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1683 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1684 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1685 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1686 Heps = _mm256_mul_ps(vfeps,H);
1687 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1688 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1689 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1693 /* Calculate temporary vectorial force */
1694 tx = _mm256_mul_ps(fscal,dx00);
1695 ty = _mm256_mul_ps(fscal,dy00);
1696 tz = _mm256_mul_ps(fscal,dz00);
1698 /* Update vectorial force */
1699 fix0 = _mm256_add_ps(fix0,tx);
1700 fiy0 = _mm256_add_ps(fiy0,ty);
1701 fiz0 = _mm256_add_ps(fiz0,tz);
1703 fjx0 = _mm256_add_ps(fjx0,tx);
1704 fjy0 = _mm256_add_ps(fjy0,ty);
1705 fjz0 = _mm256_add_ps(fjz0,tz);
1707 /**************************
1708 * CALCULATE INTERACTIONS *
1709 **************************/
1711 r01 = _mm256_mul_ps(rsq01,rinv01);
1713 /* Calculate table index by multiplying r with table scale and truncate to integer */
1714 rt = _mm256_mul_ps(r01,vftabscale);
1715 vfitab = _mm256_cvttps_epi32(rt);
1716 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1717 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1718 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1719 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1720 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1721 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1723 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1724 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1725 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1726 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1727 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1728 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1729 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1730 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1731 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1732 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1733 Heps = _mm256_mul_ps(vfeps,H);
1734 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1735 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1736 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1740 /* Calculate temporary vectorial force */
1741 tx = _mm256_mul_ps(fscal,dx01);
1742 ty = _mm256_mul_ps(fscal,dy01);
1743 tz = _mm256_mul_ps(fscal,dz01);
1745 /* Update vectorial force */
1746 fix0 = _mm256_add_ps(fix0,tx);
1747 fiy0 = _mm256_add_ps(fiy0,ty);
1748 fiz0 = _mm256_add_ps(fiz0,tz);
1750 fjx1 = _mm256_add_ps(fjx1,tx);
1751 fjy1 = _mm256_add_ps(fjy1,ty);
1752 fjz1 = _mm256_add_ps(fjz1,tz);
1754 /**************************
1755 * CALCULATE INTERACTIONS *
1756 **************************/
1758 r02 = _mm256_mul_ps(rsq02,rinv02);
1760 /* Calculate table index by multiplying r with table scale and truncate to integer */
1761 rt = _mm256_mul_ps(r02,vftabscale);
1762 vfitab = _mm256_cvttps_epi32(rt);
1763 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1764 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1765 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1766 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1767 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1768 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1770 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1771 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1772 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1773 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1774 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1775 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1776 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1777 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1778 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1779 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1780 Heps = _mm256_mul_ps(vfeps,H);
1781 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1782 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1783 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1787 /* Calculate temporary vectorial force */
1788 tx = _mm256_mul_ps(fscal,dx02);
1789 ty = _mm256_mul_ps(fscal,dy02);
1790 tz = _mm256_mul_ps(fscal,dz02);
1792 /* Update vectorial force */
1793 fix0 = _mm256_add_ps(fix0,tx);
1794 fiy0 = _mm256_add_ps(fiy0,ty);
1795 fiz0 = _mm256_add_ps(fiz0,tz);
1797 fjx2 = _mm256_add_ps(fjx2,tx);
1798 fjy2 = _mm256_add_ps(fjy2,ty);
1799 fjz2 = _mm256_add_ps(fjz2,tz);
1801 /**************************
1802 * CALCULATE INTERACTIONS *
1803 **************************/
1805 r10 = _mm256_mul_ps(rsq10,rinv10);
1807 /* Calculate table index by multiplying r with table scale and truncate to integer */
1808 rt = _mm256_mul_ps(r10,vftabscale);
1809 vfitab = _mm256_cvttps_epi32(rt);
1810 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1811 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1812 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1813 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1814 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1815 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1817 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1818 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1819 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1820 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1821 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1822 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1823 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1824 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1825 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1826 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1827 Heps = _mm256_mul_ps(vfeps,H);
1828 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1829 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1830 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1834 /* Calculate temporary vectorial force */
1835 tx = _mm256_mul_ps(fscal,dx10);
1836 ty = _mm256_mul_ps(fscal,dy10);
1837 tz = _mm256_mul_ps(fscal,dz10);
1839 /* Update vectorial force */
1840 fix1 = _mm256_add_ps(fix1,tx);
1841 fiy1 = _mm256_add_ps(fiy1,ty);
1842 fiz1 = _mm256_add_ps(fiz1,tz);
1844 fjx0 = _mm256_add_ps(fjx0,tx);
1845 fjy0 = _mm256_add_ps(fjy0,ty);
1846 fjz0 = _mm256_add_ps(fjz0,tz);
1848 /**************************
1849 * CALCULATE INTERACTIONS *
1850 **************************/
1852 r11 = _mm256_mul_ps(rsq11,rinv11);
1854 /* Calculate table index by multiplying r with table scale and truncate to integer */
1855 rt = _mm256_mul_ps(r11,vftabscale);
1856 vfitab = _mm256_cvttps_epi32(rt);
1857 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1858 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1859 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1860 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1861 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1862 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1864 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1865 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1866 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1867 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1868 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1869 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1870 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1871 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1872 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1873 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1874 Heps = _mm256_mul_ps(vfeps,H);
1875 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1876 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1877 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1881 /* Calculate temporary vectorial force */
1882 tx = _mm256_mul_ps(fscal,dx11);
1883 ty = _mm256_mul_ps(fscal,dy11);
1884 tz = _mm256_mul_ps(fscal,dz11);
1886 /* Update vectorial force */
1887 fix1 = _mm256_add_ps(fix1,tx);
1888 fiy1 = _mm256_add_ps(fiy1,ty);
1889 fiz1 = _mm256_add_ps(fiz1,tz);
1891 fjx1 = _mm256_add_ps(fjx1,tx);
1892 fjy1 = _mm256_add_ps(fjy1,ty);
1893 fjz1 = _mm256_add_ps(fjz1,tz);
1895 /**************************
1896 * CALCULATE INTERACTIONS *
1897 **************************/
1899 r12 = _mm256_mul_ps(rsq12,rinv12);
1901 /* Calculate table index by multiplying r with table scale and truncate to integer */
1902 rt = _mm256_mul_ps(r12,vftabscale);
1903 vfitab = _mm256_cvttps_epi32(rt);
1904 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1905 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1906 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1907 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1908 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1909 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1911 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1912 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1913 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1914 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1915 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1916 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1917 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1918 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1919 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1920 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1921 Heps = _mm256_mul_ps(vfeps,H);
1922 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1923 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1924 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1928 /* Calculate temporary vectorial force */
1929 tx = _mm256_mul_ps(fscal,dx12);
1930 ty = _mm256_mul_ps(fscal,dy12);
1931 tz = _mm256_mul_ps(fscal,dz12);
1933 /* Update vectorial force */
1934 fix1 = _mm256_add_ps(fix1,tx);
1935 fiy1 = _mm256_add_ps(fiy1,ty);
1936 fiz1 = _mm256_add_ps(fiz1,tz);
1938 fjx2 = _mm256_add_ps(fjx2,tx);
1939 fjy2 = _mm256_add_ps(fjy2,ty);
1940 fjz2 = _mm256_add_ps(fjz2,tz);
1942 /**************************
1943 * CALCULATE INTERACTIONS *
1944 **************************/
1946 r20 = _mm256_mul_ps(rsq20,rinv20);
1948 /* Calculate table index by multiplying r with table scale and truncate to integer */
1949 rt = _mm256_mul_ps(r20,vftabscale);
1950 vfitab = _mm256_cvttps_epi32(rt);
1951 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1952 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1953 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1954 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1955 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1956 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1958 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1959 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1960 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1961 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1962 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1963 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1964 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1965 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1966 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1967 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1968 Heps = _mm256_mul_ps(vfeps,H);
1969 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1970 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1971 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1975 /* Calculate temporary vectorial force */
1976 tx = _mm256_mul_ps(fscal,dx20);
1977 ty = _mm256_mul_ps(fscal,dy20);
1978 tz = _mm256_mul_ps(fscal,dz20);
1980 /* Update vectorial force */
1981 fix2 = _mm256_add_ps(fix2,tx);
1982 fiy2 = _mm256_add_ps(fiy2,ty);
1983 fiz2 = _mm256_add_ps(fiz2,tz);
1985 fjx0 = _mm256_add_ps(fjx0,tx);
1986 fjy0 = _mm256_add_ps(fjy0,ty);
1987 fjz0 = _mm256_add_ps(fjz0,tz);
1989 /**************************
1990 * CALCULATE INTERACTIONS *
1991 **************************/
1993 r21 = _mm256_mul_ps(rsq21,rinv21);
1995 /* Calculate table index by multiplying r with table scale and truncate to integer */
1996 rt = _mm256_mul_ps(r21,vftabscale);
1997 vfitab = _mm256_cvttps_epi32(rt);
1998 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1999 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2000 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2001 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2002 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2003 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2005 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2006 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2007 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2008 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2009 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2010 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2011 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2012 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2013 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2014 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2015 Heps = _mm256_mul_ps(vfeps,H);
2016 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2017 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2018 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2022 /* Calculate temporary vectorial force */
2023 tx = _mm256_mul_ps(fscal,dx21);
2024 ty = _mm256_mul_ps(fscal,dy21);
2025 tz = _mm256_mul_ps(fscal,dz21);
2027 /* Update vectorial force */
2028 fix2 = _mm256_add_ps(fix2,tx);
2029 fiy2 = _mm256_add_ps(fiy2,ty);
2030 fiz2 = _mm256_add_ps(fiz2,tz);
2032 fjx1 = _mm256_add_ps(fjx1,tx);
2033 fjy1 = _mm256_add_ps(fjy1,ty);
2034 fjz1 = _mm256_add_ps(fjz1,tz);
2036 /**************************
2037 * CALCULATE INTERACTIONS *
2038 **************************/
2040 r22 = _mm256_mul_ps(rsq22,rinv22);
2042 /* Calculate table index by multiplying r with table scale and truncate to integer */
2043 rt = _mm256_mul_ps(r22,vftabscale);
2044 vfitab = _mm256_cvttps_epi32(rt);
2045 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2046 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2047 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2048 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2049 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2050 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2052 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2053 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2054 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2055 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2056 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2057 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2058 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2059 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2060 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2061 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2062 Heps = _mm256_mul_ps(vfeps,H);
2063 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2064 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2065 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2069 /* Calculate temporary vectorial force */
2070 tx = _mm256_mul_ps(fscal,dx22);
2071 ty = _mm256_mul_ps(fscal,dy22);
2072 tz = _mm256_mul_ps(fscal,dz22);
2074 /* Update vectorial force */
2075 fix2 = _mm256_add_ps(fix2,tx);
2076 fiy2 = _mm256_add_ps(fiy2,ty);
2077 fiz2 = _mm256_add_ps(fiz2,tz);
2079 fjx2 = _mm256_add_ps(fjx2,tx);
2080 fjy2 = _mm256_add_ps(fjy2,ty);
2081 fjz2 = _mm256_add_ps(fjz2,tz);
2083 fjptrA = f+j_coord_offsetA;
2084 fjptrB = f+j_coord_offsetB;
2085 fjptrC = f+j_coord_offsetC;
2086 fjptrD = f+j_coord_offsetD;
2087 fjptrE = f+j_coord_offsetE;
2088 fjptrF = f+j_coord_offsetF;
2089 fjptrG = f+j_coord_offsetG;
2090 fjptrH = f+j_coord_offsetH;
2092 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2093 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2095 /* Inner loop uses 351 flops */
2098 if(jidx<j_index_end)
2101 /* Get j neighbor index, and coordinate index */
2102 jnrlistA = jjnr[jidx];
2103 jnrlistB = jjnr[jidx+1];
2104 jnrlistC = jjnr[jidx+2];
2105 jnrlistD = jjnr[jidx+3];
2106 jnrlistE = jjnr[jidx+4];
2107 jnrlistF = jjnr[jidx+5];
2108 jnrlistG = jjnr[jidx+6];
2109 jnrlistH = jjnr[jidx+7];
2110 /* Sign of each element will be negative for non-real atoms.
2111 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2112 * so use it as val = _mm_andnot_ps(mask,val) to clear dummy entries.
2114 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2115 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2117 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2118 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2119 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2120 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2121 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2122 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2123 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2124 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2125 j_coord_offsetA = DIM*jnrA;
2126 j_coord_offsetB = DIM*jnrB;
2127 j_coord_offsetC = DIM*jnrC;
2128 j_coord_offsetD = DIM*jnrD;
2129 j_coord_offsetE = DIM*jnrE;
2130 j_coord_offsetF = DIM*jnrF;
2131 j_coord_offsetG = DIM*jnrG;
2132 j_coord_offsetH = DIM*jnrH;
2134 /* load j atom coordinates */
2135 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2136 x+j_coord_offsetC,x+j_coord_offsetD,
2137 x+j_coord_offsetE,x+j_coord_offsetF,
2138 x+j_coord_offsetG,x+j_coord_offsetH,
2139 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2141 /* Calculate displacement vector */
2142 dx00 = _mm256_sub_ps(ix0,jx0);
2143 dy00 = _mm256_sub_ps(iy0,jy0);
2144 dz00 = _mm256_sub_ps(iz0,jz0);
2145 dx01 = _mm256_sub_ps(ix0,jx1);
2146 dy01 = _mm256_sub_ps(iy0,jy1);
2147 dz01 = _mm256_sub_ps(iz0,jz1);
2148 dx02 = _mm256_sub_ps(ix0,jx2);
2149 dy02 = _mm256_sub_ps(iy0,jy2);
2150 dz02 = _mm256_sub_ps(iz0,jz2);
2151 dx10 = _mm256_sub_ps(ix1,jx0);
2152 dy10 = _mm256_sub_ps(iy1,jy0);
2153 dz10 = _mm256_sub_ps(iz1,jz0);
2154 dx11 = _mm256_sub_ps(ix1,jx1);
2155 dy11 = _mm256_sub_ps(iy1,jy1);
2156 dz11 = _mm256_sub_ps(iz1,jz1);
2157 dx12 = _mm256_sub_ps(ix1,jx2);
2158 dy12 = _mm256_sub_ps(iy1,jy2);
2159 dz12 = _mm256_sub_ps(iz1,jz2);
2160 dx20 = _mm256_sub_ps(ix2,jx0);
2161 dy20 = _mm256_sub_ps(iy2,jy0);
2162 dz20 = _mm256_sub_ps(iz2,jz0);
2163 dx21 = _mm256_sub_ps(ix2,jx1);
2164 dy21 = _mm256_sub_ps(iy2,jy1);
2165 dz21 = _mm256_sub_ps(iz2,jz1);
2166 dx22 = _mm256_sub_ps(ix2,jx2);
2167 dy22 = _mm256_sub_ps(iy2,jy2);
2168 dz22 = _mm256_sub_ps(iz2,jz2);
2170 /* Calculate squared distance and things based on it */
2171 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2172 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2173 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2174 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2175 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2176 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2177 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2178 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2179 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2181 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2182 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
2183 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
2184 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
2185 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2186 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2187 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
2188 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2189 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2191 fjx0 = _mm256_setzero_ps();
2192 fjy0 = _mm256_setzero_ps();
2193 fjz0 = _mm256_setzero_ps();
2194 fjx1 = _mm256_setzero_ps();
2195 fjy1 = _mm256_setzero_ps();
2196 fjz1 = _mm256_setzero_ps();
2197 fjx2 = _mm256_setzero_ps();
2198 fjy2 = _mm256_setzero_ps();
2199 fjz2 = _mm256_setzero_ps();
2201 /**************************
2202 * CALCULATE INTERACTIONS *
2203 **************************/
2205 r00 = _mm256_mul_ps(rsq00,rinv00);
2206 r00 = _mm256_andnot_ps(dummy_mask,r00);
2208 /* Calculate table index by multiplying r with table scale and truncate to integer */
2209 rt = _mm256_mul_ps(r00,vftabscale);
2210 vfitab = _mm256_cvttps_epi32(rt);
2211 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2212 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2213 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2214 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2215 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2216 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2218 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2219 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2220 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2221 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2222 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2223 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2224 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2225 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2226 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2227 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2228 Heps = _mm256_mul_ps(vfeps,H);
2229 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2230 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2231 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2235 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2237 /* Calculate temporary vectorial force */
2238 tx = _mm256_mul_ps(fscal,dx00);
2239 ty = _mm256_mul_ps(fscal,dy00);
2240 tz = _mm256_mul_ps(fscal,dz00);
2242 /* Update vectorial force */
2243 fix0 = _mm256_add_ps(fix0,tx);
2244 fiy0 = _mm256_add_ps(fiy0,ty);
2245 fiz0 = _mm256_add_ps(fiz0,tz);
2247 fjx0 = _mm256_add_ps(fjx0,tx);
2248 fjy0 = _mm256_add_ps(fjy0,ty);
2249 fjz0 = _mm256_add_ps(fjz0,tz);
2251 /**************************
2252 * CALCULATE INTERACTIONS *
2253 **************************/
2255 r01 = _mm256_mul_ps(rsq01,rinv01);
2256 r01 = _mm256_andnot_ps(dummy_mask,r01);
2258 /* Calculate table index by multiplying r with table scale and truncate to integer */
2259 rt = _mm256_mul_ps(r01,vftabscale);
2260 vfitab = _mm256_cvttps_epi32(rt);
2261 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2262 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2263 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2264 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2265 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2266 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2268 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2269 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2270 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2271 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2272 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2273 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2274 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2275 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2276 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2277 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2278 Heps = _mm256_mul_ps(vfeps,H);
2279 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2280 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2281 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2285 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2287 /* Calculate temporary vectorial force */
2288 tx = _mm256_mul_ps(fscal,dx01);
2289 ty = _mm256_mul_ps(fscal,dy01);
2290 tz = _mm256_mul_ps(fscal,dz01);
2292 /* Update vectorial force */
2293 fix0 = _mm256_add_ps(fix0,tx);
2294 fiy0 = _mm256_add_ps(fiy0,ty);
2295 fiz0 = _mm256_add_ps(fiz0,tz);
2297 fjx1 = _mm256_add_ps(fjx1,tx);
2298 fjy1 = _mm256_add_ps(fjy1,ty);
2299 fjz1 = _mm256_add_ps(fjz1,tz);
2301 /**************************
2302 * CALCULATE INTERACTIONS *
2303 **************************/
2305 r02 = _mm256_mul_ps(rsq02,rinv02);
2306 r02 = _mm256_andnot_ps(dummy_mask,r02);
2308 /* Calculate table index by multiplying r with table scale and truncate to integer */
2309 rt = _mm256_mul_ps(r02,vftabscale);
2310 vfitab = _mm256_cvttps_epi32(rt);
2311 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2312 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2313 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2314 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2315 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2316 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2318 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2319 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2320 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2321 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2322 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2323 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2324 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2325 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2326 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2327 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2328 Heps = _mm256_mul_ps(vfeps,H);
2329 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2330 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2331 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2335 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2337 /* Calculate temporary vectorial force */
2338 tx = _mm256_mul_ps(fscal,dx02);
2339 ty = _mm256_mul_ps(fscal,dy02);
2340 tz = _mm256_mul_ps(fscal,dz02);
2342 /* Update vectorial force */
2343 fix0 = _mm256_add_ps(fix0,tx);
2344 fiy0 = _mm256_add_ps(fiy0,ty);
2345 fiz0 = _mm256_add_ps(fiz0,tz);
2347 fjx2 = _mm256_add_ps(fjx2,tx);
2348 fjy2 = _mm256_add_ps(fjy2,ty);
2349 fjz2 = _mm256_add_ps(fjz2,tz);
2351 /**************************
2352 * CALCULATE INTERACTIONS *
2353 **************************/
2355 r10 = _mm256_mul_ps(rsq10,rinv10);
2356 r10 = _mm256_andnot_ps(dummy_mask,r10);
2358 /* Calculate table index by multiplying r with table scale and truncate to integer */
2359 rt = _mm256_mul_ps(r10,vftabscale);
2360 vfitab = _mm256_cvttps_epi32(rt);
2361 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2362 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2363 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2364 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2365 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2366 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2368 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2369 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2370 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2371 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2372 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2373 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2374 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2375 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2376 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2377 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2378 Heps = _mm256_mul_ps(vfeps,H);
2379 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2380 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2381 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2385 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2387 /* Calculate temporary vectorial force */
2388 tx = _mm256_mul_ps(fscal,dx10);
2389 ty = _mm256_mul_ps(fscal,dy10);
2390 tz = _mm256_mul_ps(fscal,dz10);
2392 /* Update vectorial force */
2393 fix1 = _mm256_add_ps(fix1,tx);
2394 fiy1 = _mm256_add_ps(fiy1,ty);
2395 fiz1 = _mm256_add_ps(fiz1,tz);
2397 fjx0 = _mm256_add_ps(fjx0,tx);
2398 fjy0 = _mm256_add_ps(fjy0,ty);
2399 fjz0 = _mm256_add_ps(fjz0,tz);
2401 /**************************
2402 * CALCULATE INTERACTIONS *
2403 **************************/
2405 r11 = _mm256_mul_ps(rsq11,rinv11);
2406 r11 = _mm256_andnot_ps(dummy_mask,r11);
2408 /* Calculate table index by multiplying r with table scale and truncate to integer */
2409 rt = _mm256_mul_ps(r11,vftabscale);
2410 vfitab = _mm256_cvttps_epi32(rt);
2411 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2412 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2413 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2414 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2415 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2416 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2418 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2419 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2420 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2421 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2422 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2423 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2424 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2425 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2426 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2427 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2428 Heps = _mm256_mul_ps(vfeps,H);
2429 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2430 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2431 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2435 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2437 /* Calculate temporary vectorial force */
2438 tx = _mm256_mul_ps(fscal,dx11);
2439 ty = _mm256_mul_ps(fscal,dy11);
2440 tz = _mm256_mul_ps(fscal,dz11);
2442 /* Update vectorial force */
2443 fix1 = _mm256_add_ps(fix1,tx);
2444 fiy1 = _mm256_add_ps(fiy1,ty);
2445 fiz1 = _mm256_add_ps(fiz1,tz);
2447 fjx1 = _mm256_add_ps(fjx1,tx);
2448 fjy1 = _mm256_add_ps(fjy1,ty);
2449 fjz1 = _mm256_add_ps(fjz1,tz);
2451 /**************************
2452 * CALCULATE INTERACTIONS *
2453 **************************/
2455 r12 = _mm256_mul_ps(rsq12,rinv12);
2456 r12 = _mm256_andnot_ps(dummy_mask,r12);
2458 /* Calculate table index by multiplying r with table scale and truncate to integer */
2459 rt = _mm256_mul_ps(r12,vftabscale);
2460 vfitab = _mm256_cvttps_epi32(rt);
2461 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2462 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2463 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2464 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2465 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2466 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2468 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2469 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2470 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2471 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2472 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2473 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2474 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2475 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2476 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2477 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2478 Heps = _mm256_mul_ps(vfeps,H);
2479 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2480 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2481 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2485 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2487 /* Calculate temporary vectorial force */
2488 tx = _mm256_mul_ps(fscal,dx12);
2489 ty = _mm256_mul_ps(fscal,dy12);
2490 tz = _mm256_mul_ps(fscal,dz12);
2492 /* Update vectorial force */
2493 fix1 = _mm256_add_ps(fix1,tx);
2494 fiy1 = _mm256_add_ps(fiy1,ty);
2495 fiz1 = _mm256_add_ps(fiz1,tz);
2497 fjx2 = _mm256_add_ps(fjx2,tx);
2498 fjy2 = _mm256_add_ps(fjy2,ty);
2499 fjz2 = _mm256_add_ps(fjz2,tz);
2501 /**************************
2502 * CALCULATE INTERACTIONS *
2503 **************************/
2505 r20 = _mm256_mul_ps(rsq20,rinv20);
2506 r20 = _mm256_andnot_ps(dummy_mask,r20);
2508 /* Calculate table index by multiplying r with table scale and truncate to integer */
2509 rt = _mm256_mul_ps(r20,vftabscale);
2510 vfitab = _mm256_cvttps_epi32(rt);
2511 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2512 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2513 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2514 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2515 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2516 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2518 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2519 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2520 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2521 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2522 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2523 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2524 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2525 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2526 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2527 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2528 Heps = _mm256_mul_ps(vfeps,H);
2529 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2530 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2531 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2535 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2537 /* Calculate temporary vectorial force */
2538 tx = _mm256_mul_ps(fscal,dx20);
2539 ty = _mm256_mul_ps(fscal,dy20);
2540 tz = _mm256_mul_ps(fscal,dz20);
2542 /* Update vectorial force */
2543 fix2 = _mm256_add_ps(fix2,tx);
2544 fiy2 = _mm256_add_ps(fiy2,ty);
2545 fiz2 = _mm256_add_ps(fiz2,tz);
2547 fjx0 = _mm256_add_ps(fjx0,tx);
2548 fjy0 = _mm256_add_ps(fjy0,ty);
2549 fjz0 = _mm256_add_ps(fjz0,tz);
2551 /**************************
2552 * CALCULATE INTERACTIONS *
2553 **************************/
2555 r21 = _mm256_mul_ps(rsq21,rinv21);
2556 r21 = _mm256_andnot_ps(dummy_mask,r21);
2558 /* Calculate table index by multiplying r with table scale and truncate to integer */
2559 rt = _mm256_mul_ps(r21,vftabscale);
2560 vfitab = _mm256_cvttps_epi32(rt);
2561 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2562 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2563 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2564 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2565 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2566 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2568 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2569 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2570 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2571 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2572 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2573 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2574 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2575 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2576 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2577 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2578 Heps = _mm256_mul_ps(vfeps,H);
2579 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2580 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2581 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2585 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2587 /* Calculate temporary vectorial force */
2588 tx = _mm256_mul_ps(fscal,dx21);
2589 ty = _mm256_mul_ps(fscal,dy21);
2590 tz = _mm256_mul_ps(fscal,dz21);
2592 /* Update vectorial force */
2593 fix2 = _mm256_add_ps(fix2,tx);
2594 fiy2 = _mm256_add_ps(fiy2,ty);
2595 fiz2 = _mm256_add_ps(fiz2,tz);
2597 fjx1 = _mm256_add_ps(fjx1,tx);
2598 fjy1 = _mm256_add_ps(fjy1,ty);
2599 fjz1 = _mm256_add_ps(fjz1,tz);
2601 /**************************
2602 * CALCULATE INTERACTIONS *
2603 **************************/
2605 r22 = _mm256_mul_ps(rsq22,rinv22);
2606 r22 = _mm256_andnot_ps(dummy_mask,r22);
2608 /* Calculate table index by multiplying r with table scale and truncate to integer */
2609 rt = _mm256_mul_ps(r22,vftabscale);
2610 vfitab = _mm256_cvttps_epi32(rt);
2611 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2612 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2613 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2614 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2615 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2616 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2618 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2619 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2620 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2621 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2622 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2623 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2624 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2625 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2626 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2627 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2628 Heps = _mm256_mul_ps(vfeps,H);
2629 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2630 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2631 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2635 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2637 /* Calculate temporary vectorial force */
2638 tx = _mm256_mul_ps(fscal,dx22);
2639 ty = _mm256_mul_ps(fscal,dy22);
2640 tz = _mm256_mul_ps(fscal,dz22);
2642 /* Update vectorial force */
2643 fix2 = _mm256_add_ps(fix2,tx);
2644 fiy2 = _mm256_add_ps(fiy2,ty);
2645 fiz2 = _mm256_add_ps(fiz2,tz);
2647 fjx2 = _mm256_add_ps(fjx2,tx);
2648 fjy2 = _mm256_add_ps(fjy2,ty);
2649 fjz2 = _mm256_add_ps(fjz2,tz);
2651 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2652 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2653 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2654 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2655 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2656 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2657 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2658 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2660 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2661 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2663 /* Inner loop uses 360 flops */
2666 /* End of innermost loop */
2668 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2669 f+i_coord_offset,fshift+i_shift_offset);
2671 /* Increment number of inner iterations */
2672 inneriter += j_index_end - j_index_start;
2674 /* Outer loop uses 18 flops */
2677 /* Increment number of outer iterations */
2680 /* Update outer/inner flops */
2682 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_W3W3_F,outeriter*18 + inneriter*360);