2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/legacyheaders/types/simple.h"
46 #include "gromacs/math/vec.h"
47 #include "gromacs/legacyheaders/nrnb.h"
49 #include "gromacs/simd/math_x86_avx_256_single.h"
50 #include "kernelutil_x86_avx_256_single.h"
53 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_single
54 * Electrostatics interaction: CubicSplineTable
55 * VdW interaction: CubicSplineTable
56 * Geometry: Water4-Water4
57 * Calculate force/pot: PotentialAndForce
60 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_VF_avx_256_single
61 (t_nblist * gmx_restrict nlist,
62 rvec * gmx_restrict xx,
63 rvec * gmx_restrict ff,
64 t_forcerec * gmx_restrict fr,
65 t_mdatoms * gmx_restrict mdatoms,
66 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
67 t_nrnb * gmx_restrict nrnb)
69 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
70 * just 0 for non-waters.
71 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
72 * jnr indices corresponding to data put in the eight positions in the SIMD register.
74 int i_shift_offset,i_coord_offset,outeriter,inneriter;
75 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
76 int jnrA,jnrB,jnrC,jnrD;
77 int jnrE,jnrF,jnrG,jnrH;
78 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
79 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
80 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
81 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
82 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
84 real *shiftvec,*fshift,*x,*f;
85 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
87 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
88 real * vdwioffsetptr0;
89 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
90 real * vdwioffsetptr1;
91 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
92 real * vdwioffsetptr2;
93 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
94 real * vdwioffsetptr3;
95 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
96 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
97 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
98 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
99 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
100 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
101 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
102 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
103 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
104 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
105 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
106 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
107 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
108 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
109 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
110 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
111 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
112 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
113 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
114 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
117 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
120 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
121 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
123 __m128i vfitab_lo,vfitab_hi;
124 __m128i ifour = _mm_set1_epi32(4);
125 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
127 __m256 dummy_mask,cutoff_mask;
128 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
129 __m256 one = _mm256_set1_ps(1.0);
130 __m256 two = _mm256_set1_ps(2.0);
136 jindex = nlist->jindex;
138 shiftidx = nlist->shift;
140 shiftvec = fr->shift_vec[0];
141 fshift = fr->fshift[0];
142 facel = _mm256_set1_ps(fr->epsfac);
143 charge = mdatoms->chargeA;
144 nvdwtype = fr->ntype;
146 vdwtype = mdatoms->typeA;
148 vftab = kernel_data->table_elec_vdw->data;
149 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
151 /* Setup water-specific parameters */
152 inr = nlist->iinr[0];
153 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
154 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
155 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
156 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
158 jq1 = _mm256_set1_ps(charge[inr+1]);
159 jq2 = _mm256_set1_ps(charge[inr+2]);
160 jq3 = _mm256_set1_ps(charge[inr+3]);
161 vdwjidx0A = 2*vdwtype[inr+0];
162 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
163 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
164 qq11 = _mm256_mul_ps(iq1,jq1);
165 qq12 = _mm256_mul_ps(iq1,jq2);
166 qq13 = _mm256_mul_ps(iq1,jq3);
167 qq21 = _mm256_mul_ps(iq2,jq1);
168 qq22 = _mm256_mul_ps(iq2,jq2);
169 qq23 = _mm256_mul_ps(iq2,jq3);
170 qq31 = _mm256_mul_ps(iq3,jq1);
171 qq32 = _mm256_mul_ps(iq3,jq2);
172 qq33 = _mm256_mul_ps(iq3,jq3);
174 /* Avoid stupid compiler warnings */
175 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
188 for(iidx=0;iidx<4*DIM;iidx++)
193 /* Start outer loop over neighborlists */
194 for(iidx=0; iidx<nri; iidx++)
196 /* Load shift vector for this list */
197 i_shift_offset = DIM*shiftidx[iidx];
199 /* Load limits for loop over neighbors */
200 j_index_start = jindex[iidx];
201 j_index_end = jindex[iidx+1];
203 /* Get outer coordinate index */
205 i_coord_offset = DIM*inr;
207 /* Load i particle coords and add shift vector */
208 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
209 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
211 fix0 = _mm256_setzero_ps();
212 fiy0 = _mm256_setzero_ps();
213 fiz0 = _mm256_setzero_ps();
214 fix1 = _mm256_setzero_ps();
215 fiy1 = _mm256_setzero_ps();
216 fiz1 = _mm256_setzero_ps();
217 fix2 = _mm256_setzero_ps();
218 fiy2 = _mm256_setzero_ps();
219 fiz2 = _mm256_setzero_ps();
220 fix3 = _mm256_setzero_ps();
221 fiy3 = _mm256_setzero_ps();
222 fiz3 = _mm256_setzero_ps();
224 /* Reset potential sums */
225 velecsum = _mm256_setzero_ps();
226 vvdwsum = _mm256_setzero_ps();
228 /* Start inner kernel loop */
229 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
232 /* Get j neighbor index, and coordinate index */
241 j_coord_offsetA = DIM*jnrA;
242 j_coord_offsetB = DIM*jnrB;
243 j_coord_offsetC = DIM*jnrC;
244 j_coord_offsetD = DIM*jnrD;
245 j_coord_offsetE = DIM*jnrE;
246 j_coord_offsetF = DIM*jnrF;
247 j_coord_offsetG = DIM*jnrG;
248 j_coord_offsetH = DIM*jnrH;
250 /* load j atom coordinates */
251 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
252 x+j_coord_offsetC,x+j_coord_offsetD,
253 x+j_coord_offsetE,x+j_coord_offsetF,
254 x+j_coord_offsetG,x+j_coord_offsetH,
255 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
256 &jy2,&jz2,&jx3,&jy3,&jz3);
258 /* Calculate displacement vector */
259 dx00 = _mm256_sub_ps(ix0,jx0);
260 dy00 = _mm256_sub_ps(iy0,jy0);
261 dz00 = _mm256_sub_ps(iz0,jz0);
262 dx11 = _mm256_sub_ps(ix1,jx1);
263 dy11 = _mm256_sub_ps(iy1,jy1);
264 dz11 = _mm256_sub_ps(iz1,jz1);
265 dx12 = _mm256_sub_ps(ix1,jx2);
266 dy12 = _mm256_sub_ps(iy1,jy2);
267 dz12 = _mm256_sub_ps(iz1,jz2);
268 dx13 = _mm256_sub_ps(ix1,jx3);
269 dy13 = _mm256_sub_ps(iy1,jy3);
270 dz13 = _mm256_sub_ps(iz1,jz3);
271 dx21 = _mm256_sub_ps(ix2,jx1);
272 dy21 = _mm256_sub_ps(iy2,jy1);
273 dz21 = _mm256_sub_ps(iz2,jz1);
274 dx22 = _mm256_sub_ps(ix2,jx2);
275 dy22 = _mm256_sub_ps(iy2,jy2);
276 dz22 = _mm256_sub_ps(iz2,jz2);
277 dx23 = _mm256_sub_ps(ix2,jx3);
278 dy23 = _mm256_sub_ps(iy2,jy3);
279 dz23 = _mm256_sub_ps(iz2,jz3);
280 dx31 = _mm256_sub_ps(ix3,jx1);
281 dy31 = _mm256_sub_ps(iy3,jy1);
282 dz31 = _mm256_sub_ps(iz3,jz1);
283 dx32 = _mm256_sub_ps(ix3,jx2);
284 dy32 = _mm256_sub_ps(iy3,jy2);
285 dz32 = _mm256_sub_ps(iz3,jz2);
286 dx33 = _mm256_sub_ps(ix3,jx3);
287 dy33 = _mm256_sub_ps(iy3,jy3);
288 dz33 = _mm256_sub_ps(iz3,jz3);
290 /* Calculate squared distance and things based on it */
291 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
292 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
293 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
294 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
295 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
296 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
297 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
298 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
299 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
300 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
302 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
303 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
304 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
305 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
306 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
307 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
308 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
309 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
310 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
311 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
313 fjx0 = _mm256_setzero_ps();
314 fjy0 = _mm256_setzero_ps();
315 fjz0 = _mm256_setzero_ps();
316 fjx1 = _mm256_setzero_ps();
317 fjy1 = _mm256_setzero_ps();
318 fjz1 = _mm256_setzero_ps();
319 fjx2 = _mm256_setzero_ps();
320 fjy2 = _mm256_setzero_ps();
321 fjz2 = _mm256_setzero_ps();
322 fjx3 = _mm256_setzero_ps();
323 fjy3 = _mm256_setzero_ps();
324 fjz3 = _mm256_setzero_ps();
326 /**************************
327 * CALCULATE INTERACTIONS *
328 **************************/
330 r00 = _mm256_mul_ps(rsq00,rinv00);
332 /* Calculate table index by multiplying r with table scale and truncate to integer */
333 rt = _mm256_mul_ps(r00,vftabscale);
334 vfitab = _mm256_cvttps_epi32(rt);
335 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
336 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
337 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
338 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
339 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
340 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
342 /* CUBIC SPLINE TABLE DISPERSION */
343 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
344 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
345 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
346 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
347 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
348 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
349 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
350 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
351 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
352 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
353 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
354 Heps = _mm256_mul_ps(vfeps,H);
355 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
356 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
357 vvdw6 = _mm256_mul_ps(c6_00,VV);
358 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
359 fvdw6 = _mm256_mul_ps(c6_00,FF);
361 /* CUBIC SPLINE TABLE REPULSION */
362 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
363 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
364 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
365 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
366 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
367 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
368 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
369 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
370 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
371 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
372 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
373 Heps = _mm256_mul_ps(vfeps,H);
374 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
375 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
376 vvdw12 = _mm256_mul_ps(c12_00,VV);
377 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
378 fvdw12 = _mm256_mul_ps(c12_00,FF);
379 vvdw = _mm256_add_ps(vvdw12,vvdw6);
380 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
382 /* Update potential sum for this i atom from the interaction with this j atom. */
383 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
387 /* Calculate temporary vectorial force */
388 tx = _mm256_mul_ps(fscal,dx00);
389 ty = _mm256_mul_ps(fscal,dy00);
390 tz = _mm256_mul_ps(fscal,dz00);
392 /* Update vectorial force */
393 fix0 = _mm256_add_ps(fix0,tx);
394 fiy0 = _mm256_add_ps(fiy0,ty);
395 fiz0 = _mm256_add_ps(fiz0,tz);
397 fjx0 = _mm256_add_ps(fjx0,tx);
398 fjy0 = _mm256_add_ps(fjy0,ty);
399 fjz0 = _mm256_add_ps(fjz0,tz);
401 /**************************
402 * CALCULATE INTERACTIONS *
403 **************************/
405 r11 = _mm256_mul_ps(rsq11,rinv11);
407 /* Calculate table index by multiplying r with table scale and truncate to integer */
408 rt = _mm256_mul_ps(r11,vftabscale);
409 vfitab = _mm256_cvttps_epi32(rt);
410 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
411 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
412 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
413 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
414 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
415 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
417 /* CUBIC SPLINE TABLE ELECTROSTATICS */
418 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
419 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
420 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
421 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
422 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
423 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
424 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
425 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
426 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
427 Heps = _mm256_mul_ps(vfeps,H);
428 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
429 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
430 velec = _mm256_mul_ps(qq11,VV);
431 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
432 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
434 /* Update potential sum for this i atom from the interaction with this j atom. */
435 velecsum = _mm256_add_ps(velecsum,velec);
439 /* Calculate temporary vectorial force */
440 tx = _mm256_mul_ps(fscal,dx11);
441 ty = _mm256_mul_ps(fscal,dy11);
442 tz = _mm256_mul_ps(fscal,dz11);
444 /* Update vectorial force */
445 fix1 = _mm256_add_ps(fix1,tx);
446 fiy1 = _mm256_add_ps(fiy1,ty);
447 fiz1 = _mm256_add_ps(fiz1,tz);
449 fjx1 = _mm256_add_ps(fjx1,tx);
450 fjy1 = _mm256_add_ps(fjy1,ty);
451 fjz1 = _mm256_add_ps(fjz1,tz);
453 /**************************
454 * CALCULATE INTERACTIONS *
455 **************************/
457 r12 = _mm256_mul_ps(rsq12,rinv12);
459 /* Calculate table index by multiplying r with table scale and truncate to integer */
460 rt = _mm256_mul_ps(r12,vftabscale);
461 vfitab = _mm256_cvttps_epi32(rt);
462 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
463 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
464 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
465 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
466 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
467 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
469 /* CUBIC SPLINE TABLE ELECTROSTATICS */
470 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
471 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
472 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
473 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
474 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
475 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
476 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
477 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
478 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
479 Heps = _mm256_mul_ps(vfeps,H);
480 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
481 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
482 velec = _mm256_mul_ps(qq12,VV);
483 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
484 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
486 /* Update potential sum for this i atom from the interaction with this j atom. */
487 velecsum = _mm256_add_ps(velecsum,velec);
491 /* Calculate temporary vectorial force */
492 tx = _mm256_mul_ps(fscal,dx12);
493 ty = _mm256_mul_ps(fscal,dy12);
494 tz = _mm256_mul_ps(fscal,dz12);
496 /* Update vectorial force */
497 fix1 = _mm256_add_ps(fix1,tx);
498 fiy1 = _mm256_add_ps(fiy1,ty);
499 fiz1 = _mm256_add_ps(fiz1,tz);
501 fjx2 = _mm256_add_ps(fjx2,tx);
502 fjy2 = _mm256_add_ps(fjy2,ty);
503 fjz2 = _mm256_add_ps(fjz2,tz);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 r13 = _mm256_mul_ps(rsq13,rinv13);
511 /* Calculate table index by multiplying r with table scale and truncate to integer */
512 rt = _mm256_mul_ps(r13,vftabscale);
513 vfitab = _mm256_cvttps_epi32(rt);
514 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
515 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
516 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
517 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
518 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
519 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
521 /* CUBIC SPLINE TABLE ELECTROSTATICS */
522 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
523 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
524 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
525 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
526 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
527 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
528 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
529 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
530 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
531 Heps = _mm256_mul_ps(vfeps,H);
532 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
533 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
534 velec = _mm256_mul_ps(qq13,VV);
535 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
536 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velecsum = _mm256_add_ps(velecsum,velec);
543 /* Calculate temporary vectorial force */
544 tx = _mm256_mul_ps(fscal,dx13);
545 ty = _mm256_mul_ps(fscal,dy13);
546 tz = _mm256_mul_ps(fscal,dz13);
548 /* Update vectorial force */
549 fix1 = _mm256_add_ps(fix1,tx);
550 fiy1 = _mm256_add_ps(fiy1,ty);
551 fiz1 = _mm256_add_ps(fiz1,tz);
553 fjx3 = _mm256_add_ps(fjx3,tx);
554 fjy3 = _mm256_add_ps(fjy3,ty);
555 fjz3 = _mm256_add_ps(fjz3,tz);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
561 r21 = _mm256_mul_ps(rsq21,rinv21);
563 /* Calculate table index by multiplying r with table scale and truncate to integer */
564 rt = _mm256_mul_ps(r21,vftabscale);
565 vfitab = _mm256_cvttps_epi32(rt);
566 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
567 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
568 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
569 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
570 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
571 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
573 /* CUBIC SPLINE TABLE ELECTROSTATICS */
574 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
575 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
576 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
577 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
578 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
579 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
580 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
581 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
582 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
583 Heps = _mm256_mul_ps(vfeps,H);
584 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
585 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
586 velec = _mm256_mul_ps(qq21,VV);
587 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
588 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
590 /* Update potential sum for this i atom from the interaction with this j atom. */
591 velecsum = _mm256_add_ps(velecsum,velec);
595 /* Calculate temporary vectorial force */
596 tx = _mm256_mul_ps(fscal,dx21);
597 ty = _mm256_mul_ps(fscal,dy21);
598 tz = _mm256_mul_ps(fscal,dz21);
600 /* Update vectorial force */
601 fix2 = _mm256_add_ps(fix2,tx);
602 fiy2 = _mm256_add_ps(fiy2,ty);
603 fiz2 = _mm256_add_ps(fiz2,tz);
605 fjx1 = _mm256_add_ps(fjx1,tx);
606 fjy1 = _mm256_add_ps(fjy1,ty);
607 fjz1 = _mm256_add_ps(fjz1,tz);
609 /**************************
610 * CALCULATE INTERACTIONS *
611 **************************/
613 r22 = _mm256_mul_ps(rsq22,rinv22);
615 /* Calculate table index by multiplying r with table scale and truncate to integer */
616 rt = _mm256_mul_ps(r22,vftabscale);
617 vfitab = _mm256_cvttps_epi32(rt);
618 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
619 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
620 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
621 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
622 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
623 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
625 /* CUBIC SPLINE TABLE ELECTROSTATICS */
626 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
627 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
628 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
629 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
630 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
631 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
632 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
633 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
634 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
635 Heps = _mm256_mul_ps(vfeps,H);
636 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
637 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
638 velec = _mm256_mul_ps(qq22,VV);
639 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
640 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
642 /* Update potential sum for this i atom from the interaction with this j atom. */
643 velecsum = _mm256_add_ps(velecsum,velec);
647 /* Calculate temporary vectorial force */
648 tx = _mm256_mul_ps(fscal,dx22);
649 ty = _mm256_mul_ps(fscal,dy22);
650 tz = _mm256_mul_ps(fscal,dz22);
652 /* Update vectorial force */
653 fix2 = _mm256_add_ps(fix2,tx);
654 fiy2 = _mm256_add_ps(fiy2,ty);
655 fiz2 = _mm256_add_ps(fiz2,tz);
657 fjx2 = _mm256_add_ps(fjx2,tx);
658 fjy2 = _mm256_add_ps(fjy2,ty);
659 fjz2 = _mm256_add_ps(fjz2,tz);
661 /**************************
662 * CALCULATE INTERACTIONS *
663 **************************/
665 r23 = _mm256_mul_ps(rsq23,rinv23);
667 /* Calculate table index by multiplying r with table scale and truncate to integer */
668 rt = _mm256_mul_ps(r23,vftabscale);
669 vfitab = _mm256_cvttps_epi32(rt);
670 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
671 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
672 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
673 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
674 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
675 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
677 /* CUBIC SPLINE TABLE ELECTROSTATICS */
678 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
679 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
680 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
681 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
682 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
683 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
684 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
685 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
686 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
687 Heps = _mm256_mul_ps(vfeps,H);
688 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
689 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
690 velec = _mm256_mul_ps(qq23,VV);
691 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
692 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
694 /* Update potential sum for this i atom from the interaction with this j atom. */
695 velecsum = _mm256_add_ps(velecsum,velec);
699 /* Calculate temporary vectorial force */
700 tx = _mm256_mul_ps(fscal,dx23);
701 ty = _mm256_mul_ps(fscal,dy23);
702 tz = _mm256_mul_ps(fscal,dz23);
704 /* Update vectorial force */
705 fix2 = _mm256_add_ps(fix2,tx);
706 fiy2 = _mm256_add_ps(fiy2,ty);
707 fiz2 = _mm256_add_ps(fiz2,tz);
709 fjx3 = _mm256_add_ps(fjx3,tx);
710 fjy3 = _mm256_add_ps(fjy3,ty);
711 fjz3 = _mm256_add_ps(fjz3,tz);
713 /**************************
714 * CALCULATE INTERACTIONS *
715 **************************/
717 r31 = _mm256_mul_ps(rsq31,rinv31);
719 /* Calculate table index by multiplying r with table scale and truncate to integer */
720 rt = _mm256_mul_ps(r31,vftabscale);
721 vfitab = _mm256_cvttps_epi32(rt);
722 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
723 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
724 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
725 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
726 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
727 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
729 /* CUBIC SPLINE TABLE ELECTROSTATICS */
730 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
731 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
732 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
733 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
734 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
735 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
736 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
737 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
738 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
739 Heps = _mm256_mul_ps(vfeps,H);
740 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
741 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
742 velec = _mm256_mul_ps(qq31,VV);
743 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
744 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
746 /* Update potential sum for this i atom from the interaction with this j atom. */
747 velecsum = _mm256_add_ps(velecsum,velec);
751 /* Calculate temporary vectorial force */
752 tx = _mm256_mul_ps(fscal,dx31);
753 ty = _mm256_mul_ps(fscal,dy31);
754 tz = _mm256_mul_ps(fscal,dz31);
756 /* Update vectorial force */
757 fix3 = _mm256_add_ps(fix3,tx);
758 fiy3 = _mm256_add_ps(fiy3,ty);
759 fiz3 = _mm256_add_ps(fiz3,tz);
761 fjx1 = _mm256_add_ps(fjx1,tx);
762 fjy1 = _mm256_add_ps(fjy1,ty);
763 fjz1 = _mm256_add_ps(fjz1,tz);
765 /**************************
766 * CALCULATE INTERACTIONS *
767 **************************/
769 r32 = _mm256_mul_ps(rsq32,rinv32);
771 /* Calculate table index by multiplying r with table scale and truncate to integer */
772 rt = _mm256_mul_ps(r32,vftabscale);
773 vfitab = _mm256_cvttps_epi32(rt);
774 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
775 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
776 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
777 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
778 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
779 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
781 /* CUBIC SPLINE TABLE ELECTROSTATICS */
782 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
783 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
784 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
785 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
786 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
787 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
788 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
789 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
790 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
791 Heps = _mm256_mul_ps(vfeps,H);
792 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
793 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
794 velec = _mm256_mul_ps(qq32,VV);
795 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
796 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
798 /* Update potential sum for this i atom from the interaction with this j atom. */
799 velecsum = _mm256_add_ps(velecsum,velec);
803 /* Calculate temporary vectorial force */
804 tx = _mm256_mul_ps(fscal,dx32);
805 ty = _mm256_mul_ps(fscal,dy32);
806 tz = _mm256_mul_ps(fscal,dz32);
808 /* Update vectorial force */
809 fix3 = _mm256_add_ps(fix3,tx);
810 fiy3 = _mm256_add_ps(fiy3,ty);
811 fiz3 = _mm256_add_ps(fiz3,tz);
813 fjx2 = _mm256_add_ps(fjx2,tx);
814 fjy2 = _mm256_add_ps(fjy2,ty);
815 fjz2 = _mm256_add_ps(fjz2,tz);
817 /**************************
818 * CALCULATE INTERACTIONS *
819 **************************/
821 r33 = _mm256_mul_ps(rsq33,rinv33);
823 /* Calculate table index by multiplying r with table scale and truncate to integer */
824 rt = _mm256_mul_ps(r33,vftabscale);
825 vfitab = _mm256_cvttps_epi32(rt);
826 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
827 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
828 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
829 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
830 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
831 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
833 /* CUBIC SPLINE TABLE ELECTROSTATICS */
834 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
835 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
836 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
837 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
838 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
839 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
840 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
841 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
842 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
843 Heps = _mm256_mul_ps(vfeps,H);
844 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
845 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
846 velec = _mm256_mul_ps(qq33,VV);
847 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
848 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
850 /* Update potential sum for this i atom from the interaction with this j atom. */
851 velecsum = _mm256_add_ps(velecsum,velec);
855 /* Calculate temporary vectorial force */
856 tx = _mm256_mul_ps(fscal,dx33);
857 ty = _mm256_mul_ps(fscal,dy33);
858 tz = _mm256_mul_ps(fscal,dz33);
860 /* Update vectorial force */
861 fix3 = _mm256_add_ps(fix3,tx);
862 fiy3 = _mm256_add_ps(fiy3,ty);
863 fiz3 = _mm256_add_ps(fiz3,tz);
865 fjx3 = _mm256_add_ps(fjx3,tx);
866 fjy3 = _mm256_add_ps(fjy3,ty);
867 fjz3 = _mm256_add_ps(fjz3,tz);
869 fjptrA = f+j_coord_offsetA;
870 fjptrB = f+j_coord_offsetB;
871 fjptrC = f+j_coord_offsetC;
872 fjptrD = f+j_coord_offsetD;
873 fjptrE = f+j_coord_offsetE;
874 fjptrF = f+j_coord_offsetF;
875 fjptrG = f+j_coord_offsetG;
876 fjptrH = f+j_coord_offsetH;
878 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
879 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
880 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
882 /* Inner loop uses 446 flops */
888 /* Get j neighbor index, and coordinate index */
889 jnrlistA = jjnr[jidx];
890 jnrlistB = jjnr[jidx+1];
891 jnrlistC = jjnr[jidx+2];
892 jnrlistD = jjnr[jidx+3];
893 jnrlistE = jjnr[jidx+4];
894 jnrlistF = jjnr[jidx+5];
895 jnrlistG = jjnr[jidx+6];
896 jnrlistH = jjnr[jidx+7];
897 /* Sign of each element will be negative for non-real atoms.
898 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
899 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
901 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
902 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
904 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
905 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
906 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
907 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
908 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
909 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
910 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
911 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
912 j_coord_offsetA = DIM*jnrA;
913 j_coord_offsetB = DIM*jnrB;
914 j_coord_offsetC = DIM*jnrC;
915 j_coord_offsetD = DIM*jnrD;
916 j_coord_offsetE = DIM*jnrE;
917 j_coord_offsetF = DIM*jnrF;
918 j_coord_offsetG = DIM*jnrG;
919 j_coord_offsetH = DIM*jnrH;
921 /* load j atom coordinates */
922 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
923 x+j_coord_offsetC,x+j_coord_offsetD,
924 x+j_coord_offsetE,x+j_coord_offsetF,
925 x+j_coord_offsetG,x+j_coord_offsetH,
926 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
927 &jy2,&jz2,&jx3,&jy3,&jz3);
929 /* Calculate displacement vector */
930 dx00 = _mm256_sub_ps(ix0,jx0);
931 dy00 = _mm256_sub_ps(iy0,jy0);
932 dz00 = _mm256_sub_ps(iz0,jz0);
933 dx11 = _mm256_sub_ps(ix1,jx1);
934 dy11 = _mm256_sub_ps(iy1,jy1);
935 dz11 = _mm256_sub_ps(iz1,jz1);
936 dx12 = _mm256_sub_ps(ix1,jx2);
937 dy12 = _mm256_sub_ps(iy1,jy2);
938 dz12 = _mm256_sub_ps(iz1,jz2);
939 dx13 = _mm256_sub_ps(ix1,jx3);
940 dy13 = _mm256_sub_ps(iy1,jy3);
941 dz13 = _mm256_sub_ps(iz1,jz3);
942 dx21 = _mm256_sub_ps(ix2,jx1);
943 dy21 = _mm256_sub_ps(iy2,jy1);
944 dz21 = _mm256_sub_ps(iz2,jz1);
945 dx22 = _mm256_sub_ps(ix2,jx2);
946 dy22 = _mm256_sub_ps(iy2,jy2);
947 dz22 = _mm256_sub_ps(iz2,jz2);
948 dx23 = _mm256_sub_ps(ix2,jx3);
949 dy23 = _mm256_sub_ps(iy2,jy3);
950 dz23 = _mm256_sub_ps(iz2,jz3);
951 dx31 = _mm256_sub_ps(ix3,jx1);
952 dy31 = _mm256_sub_ps(iy3,jy1);
953 dz31 = _mm256_sub_ps(iz3,jz1);
954 dx32 = _mm256_sub_ps(ix3,jx2);
955 dy32 = _mm256_sub_ps(iy3,jy2);
956 dz32 = _mm256_sub_ps(iz3,jz2);
957 dx33 = _mm256_sub_ps(ix3,jx3);
958 dy33 = _mm256_sub_ps(iy3,jy3);
959 dz33 = _mm256_sub_ps(iz3,jz3);
961 /* Calculate squared distance and things based on it */
962 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
963 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
964 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
965 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
966 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
967 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
968 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
969 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
970 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
971 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
973 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
974 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
975 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
976 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
977 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
978 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
979 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
980 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
981 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
982 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
984 fjx0 = _mm256_setzero_ps();
985 fjy0 = _mm256_setzero_ps();
986 fjz0 = _mm256_setzero_ps();
987 fjx1 = _mm256_setzero_ps();
988 fjy1 = _mm256_setzero_ps();
989 fjz1 = _mm256_setzero_ps();
990 fjx2 = _mm256_setzero_ps();
991 fjy2 = _mm256_setzero_ps();
992 fjz2 = _mm256_setzero_ps();
993 fjx3 = _mm256_setzero_ps();
994 fjy3 = _mm256_setzero_ps();
995 fjz3 = _mm256_setzero_ps();
997 /**************************
998 * CALCULATE INTERACTIONS *
999 **************************/
1001 r00 = _mm256_mul_ps(rsq00,rinv00);
1002 r00 = _mm256_andnot_ps(dummy_mask,r00);
1004 /* Calculate table index by multiplying r with table scale and truncate to integer */
1005 rt = _mm256_mul_ps(r00,vftabscale);
1006 vfitab = _mm256_cvttps_epi32(rt);
1007 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1008 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1009 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1010 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1011 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1012 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1014 /* CUBIC SPLINE TABLE DISPERSION */
1015 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1016 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1017 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1018 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1019 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1020 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1021 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1022 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1023 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1024 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1025 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1026 Heps = _mm256_mul_ps(vfeps,H);
1027 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1028 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1029 vvdw6 = _mm256_mul_ps(c6_00,VV);
1030 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1031 fvdw6 = _mm256_mul_ps(c6_00,FF);
1033 /* CUBIC SPLINE TABLE REPULSION */
1034 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1035 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1036 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1037 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1038 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1039 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1040 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1041 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1042 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1043 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1044 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1045 Heps = _mm256_mul_ps(vfeps,H);
1046 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1047 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1048 vvdw12 = _mm256_mul_ps(c12_00,VV);
1049 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1050 fvdw12 = _mm256_mul_ps(c12_00,FF);
1051 vvdw = _mm256_add_ps(vvdw12,vvdw6);
1052 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1054 /* Update potential sum for this i atom from the interaction with this j atom. */
1055 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
1056 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
1060 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1062 /* Calculate temporary vectorial force */
1063 tx = _mm256_mul_ps(fscal,dx00);
1064 ty = _mm256_mul_ps(fscal,dy00);
1065 tz = _mm256_mul_ps(fscal,dz00);
1067 /* Update vectorial force */
1068 fix0 = _mm256_add_ps(fix0,tx);
1069 fiy0 = _mm256_add_ps(fiy0,ty);
1070 fiz0 = _mm256_add_ps(fiz0,tz);
1072 fjx0 = _mm256_add_ps(fjx0,tx);
1073 fjy0 = _mm256_add_ps(fjy0,ty);
1074 fjz0 = _mm256_add_ps(fjz0,tz);
1076 /**************************
1077 * CALCULATE INTERACTIONS *
1078 **************************/
1080 r11 = _mm256_mul_ps(rsq11,rinv11);
1081 r11 = _mm256_andnot_ps(dummy_mask,r11);
1083 /* Calculate table index by multiplying r with table scale and truncate to integer */
1084 rt = _mm256_mul_ps(r11,vftabscale);
1085 vfitab = _mm256_cvttps_epi32(rt);
1086 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1087 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1088 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1089 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1090 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1091 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1093 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1094 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1095 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1096 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1097 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1098 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1099 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1100 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1101 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1102 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1103 Heps = _mm256_mul_ps(vfeps,H);
1104 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1105 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1106 velec = _mm256_mul_ps(qq11,VV);
1107 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1108 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1110 /* Update potential sum for this i atom from the interaction with this j atom. */
1111 velec = _mm256_andnot_ps(dummy_mask,velec);
1112 velecsum = _mm256_add_ps(velecsum,velec);
1116 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1118 /* Calculate temporary vectorial force */
1119 tx = _mm256_mul_ps(fscal,dx11);
1120 ty = _mm256_mul_ps(fscal,dy11);
1121 tz = _mm256_mul_ps(fscal,dz11);
1123 /* Update vectorial force */
1124 fix1 = _mm256_add_ps(fix1,tx);
1125 fiy1 = _mm256_add_ps(fiy1,ty);
1126 fiz1 = _mm256_add_ps(fiz1,tz);
1128 fjx1 = _mm256_add_ps(fjx1,tx);
1129 fjy1 = _mm256_add_ps(fjy1,ty);
1130 fjz1 = _mm256_add_ps(fjz1,tz);
1132 /**************************
1133 * CALCULATE INTERACTIONS *
1134 **************************/
1136 r12 = _mm256_mul_ps(rsq12,rinv12);
1137 r12 = _mm256_andnot_ps(dummy_mask,r12);
1139 /* Calculate table index by multiplying r with table scale and truncate to integer */
1140 rt = _mm256_mul_ps(r12,vftabscale);
1141 vfitab = _mm256_cvttps_epi32(rt);
1142 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1143 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1144 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1145 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1146 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1147 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1149 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1150 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1151 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1152 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1153 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1154 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1155 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1156 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1157 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1158 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1159 Heps = _mm256_mul_ps(vfeps,H);
1160 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1161 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1162 velec = _mm256_mul_ps(qq12,VV);
1163 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1164 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1166 /* Update potential sum for this i atom from the interaction with this j atom. */
1167 velec = _mm256_andnot_ps(dummy_mask,velec);
1168 velecsum = _mm256_add_ps(velecsum,velec);
1172 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1174 /* Calculate temporary vectorial force */
1175 tx = _mm256_mul_ps(fscal,dx12);
1176 ty = _mm256_mul_ps(fscal,dy12);
1177 tz = _mm256_mul_ps(fscal,dz12);
1179 /* Update vectorial force */
1180 fix1 = _mm256_add_ps(fix1,tx);
1181 fiy1 = _mm256_add_ps(fiy1,ty);
1182 fiz1 = _mm256_add_ps(fiz1,tz);
1184 fjx2 = _mm256_add_ps(fjx2,tx);
1185 fjy2 = _mm256_add_ps(fjy2,ty);
1186 fjz2 = _mm256_add_ps(fjz2,tz);
1188 /**************************
1189 * CALCULATE INTERACTIONS *
1190 **************************/
1192 r13 = _mm256_mul_ps(rsq13,rinv13);
1193 r13 = _mm256_andnot_ps(dummy_mask,r13);
1195 /* Calculate table index by multiplying r with table scale and truncate to integer */
1196 rt = _mm256_mul_ps(r13,vftabscale);
1197 vfitab = _mm256_cvttps_epi32(rt);
1198 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1199 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1200 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1201 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1202 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1203 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1205 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1206 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1207 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1208 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1209 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1210 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1211 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1212 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1213 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1214 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1215 Heps = _mm256_mul_ps(vfeps,H);
1216 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1217 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1218 velec = _mm256_mul_ps(qq13,VV);
1219 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1220 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
1222 /* Update potential sum for this i atom from the interaction with this j atom. */
1223 velec = _mm256_andnot_ps(dummy_mask,velec);
1224 velecsum = _mm256_add_ps(velecsum,velec);
1228 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1230 /* Calculate temporary vectorial force */
1231 tx = _mm256_mul_ps(fscal,dx13);
1232 ty = _mm256_mul_ps(fscal,dy13);
1233 tz = _mm256_mul_ps(fscal,dz13);
1235 /* Update vectorial force */
1236 fix1 = _mm256_add_ps(fix1,tx);
1237 fiy1 = _mm256_add_ps(fiy1,ty);
1238 fiz1 = _mm256_add_ps(fiz1,tz);
1240 fjx3 = _mm256_add_ps(fjx3,tx);
1241 fjy3 = _mm256_add_ps(fjy3,ty);
1242 fjz3 = _mm256_add_ps(fjz3,tz);
1244 /**************************
1245 * CALCULATE INTERACTIONS *
1246 **************************/
1248 r21 = _mm256_mul_ps(rsq21,rinv21);
1249 r21 = _mm256_andnot_ps(dummy_mask,r21);
1251 /* Calculate table index by multiplying r with table scale and truncate to integer */
1252 rt = _mm256_mul_ps(r21,vftabscale);
1253 vfitab = _mm256_cvttps_epi32(rt);
1254 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1255 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1256 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1257 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1258 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1259 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1261 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1262 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1263 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1264 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1265 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1266 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1267 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1268 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1269 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1270 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1271 Heps = _mm256_mul_ps(vfeps,H);
1272 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1273 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1274 velec = _mm256_mul_ps(qq21,VV);
1275 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1276 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1278 /* Update potential sum for this i atom from the interaction with this j atom. */
1279 velec = _mm256_andnot_ps(dummy_mask,velec);
1280 velecsum = _mm256_add_ps(velecsum,velec);
1284 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1286 /* Calculate temporary vectorial force */
1287 tx = _mm256_mul_ps(fscal,dx21);
1288 ty = _mm256_mul_ps(fscal,dy21);
1289 tz = _mm256_mul_ps(fscal,dz21);
1291 /* Update vectorial force */
1292 fix2 = _mm256_add_ps(fix2,tx);
1293 fiy2 = _mm256_add_ps(fiy2,ty);
1294 fiz2 = _mm256_add_ps(fiz2,tz);
1296 fjx1 = _mm256_add_ps(fjx1,tx);
1297 fjy1 = _mm256_add_ps(fjy1,ty);
1298 fjz1 = _mm256_add_ps(fjz1,tz);
1300 /**************************
1301 * CALCULATE INTERACTIONS *
1302 **************************/
1304 r22 = _mm256_mul_ps(rsq22,rinv22);
1305 r22 = _mm256_andnot_ps(dummy_mask,r22);
1307 /* Calculate table index by multiplying r with table scale and truncate to integer */
1308 rt = _mm256_mul_ps(r22,vftabscale);
1309 vfitab = _mm256_cvttps_epi32(rt);
1310 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1311 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1312 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1313 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1314 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1315 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1317 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1318 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1319 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1320 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1321 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1322 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1323 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1324 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1325 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1326 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1327 Heps = _mm256_mul_ps(vfeps,H);
1328 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1329 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1330 velec = _mm256_mul_ps(qq22,VV);
1331 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1332 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1334 /* Update potential sum for this i atom from the interaction with this j atom. */
1335 velec = _mm256_andnot_ps(dummy_mask,velec);
1336 velecsum = _mm256_add_ps(velecsum,velec);
1340 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1342 /* Calculate temporary vectorial force */
1343 tx = _mm256_mul_ps(fscal,dx22);
1344 ty = _mm256_mul_ps(fscal,dy22);
1345 tz = _mm256_mul_ps(fscal,dz22);
1347 /* Update vectorial force */
1348 fix2 = _mm256_add_ps(fix2,tx);
1349 fiy2 = _mm256_add_ps(fiy2,ty);
1350 fiz2 = _mm256_add_ps(fiz2,tz);
1352 fjx2 = _mm256_add_ps(fjx2,tx);
1353 fjy2 = _mm256_add_ps(fjy2,ty);
1354 fjz2 = _mm256_add_ps(fjz2,tz);
1356 /**************************
1357 * CALCULATE INTERACTIONS *
1358 **************************/
1360 r23 = _mm256_mul_ps(rsq23,rinv23);
1361 r23 = _mm256_andnot_ps(dummy_mask,r23);
1363 /* Calculate table index by multiplying r with table scale and truncate to integer */
1364 rt = _mm256_mul_ps(r23,vftabscale);
1365 vfitab = _mm256_cvttps_epi32(rt);
1366 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1367 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1368 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1369 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1370 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1371 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1373 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1374 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1375 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1376 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1377 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1378 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1379 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1380 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1381 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1382 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1383 Heps = _mm256_mul_ps(vfeps,H);
1384 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1385 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1386 velec = _mm256_mul_ps(qq23,VV);
1387 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1388 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
1390 /* Update potential sum for this i atom from the interaction with this j atom. */
1391 velec = _mm256_andnot_ps(dummy_mask,velec);
1392 velecsum = _mm256_add_ps(velecsum,velec);
1396 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1398 /* Calculate temporary vectorial force */
1399 tx = _mm256_mul_ps(fscal,dx23);
1400 ty = _mm256_mul_ps(fscal,dy23);
1401 tz = _mm256_mul_ps(fscal,dz23);
1403 /* Update vectorial force */
1404 fix2 = _mm256_add_ps(fix2,tx);
1405 fiy2 = _mm256_add_ps(fiy2,ty);
1406 fiz2 = _mm256_add_ps(fiz2,tz);
1408 fjx3 = _mm256_add_ps(fjx3,tx);
1409 fjy3 = _mm256_add_ps(fjy3,ty);
1410 fjz3 = _mm256_add_ps(fjz3,tz);
1412 /**************************
1413 * CALCULATE INTERACTIONS *
1414 **************************/
1416 r31 = _mm256_mul_ps(rsq31,rinv31);
1417 r31 = _mm256_andnot_ps(dummy_mask,r31);
1419 /* Calculate table index by multiplying r with table scale and truncate to integer */
1420 rt = _mm256_mul_ps(r31,vftabscale);
1421 vfitab = _mm256_cvttps_epi32(rt);
1422 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1423 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1424 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1425 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1426 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1427 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1429 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1430 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1431 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1432 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1433 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1434 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1435 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1436 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1437 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1438 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1439 Heps = _mm256_mul_ps(vfeps,H);
1440 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1441 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1442 velec = _mm256_mul_ps(qq31,VV);
1443 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1444 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
1446 /* Update potential sum for this i atom from the interaction with this j atom. */
1447 velec = _mm256_andnot_ps(dummy_mask,velec);
1448 velecsum = _mm256_add_ps(velecsum,velec);
1452 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1454 /* Calculate temporary vectorial force */
1455 tx = _mm256_mul_ps(fscal,dx31);
1456 ty = _mm256_mul_ps(fscal,dy31);
1457 tz = _mm256_mul_ps(fscal,dz31);
1459 /* Update vectorial force */
1460 fix3 = _mm256_add_ps(fix3,tx);
1461 fiy3 = _mm256_add_ps(fiy3,ty);
1462 fiz3 = _mm256_add_ps(fiz3,tz);
1464 fjx1 = _mm256_add_ps(fjx1,tx);
1465 fjy1 = _mm256_add_ps(fjy1,ty);
1466 fjz1 = _mm256_add_ps(fjz1,tz);
1468 /**************************
1469 * CALCULATE INTERACTIONS *
1470 **************************/
1472 r32 = _mm256_mul_ps(rsq32,rinv32);
1473 r32 = _mm256_andnot_ps(dummy_mask,r32);
1475 /* Calculate table index by multiplying r with table scale and truncate to integer */
1476 rt = _mm256_mul_ps(r32,vftabscale);
1477 vfitab = _mm256_cvttps_epi32(rt);
1478 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1479 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1480 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1481 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1482 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1483 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1485 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1486 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1487 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1488 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1489 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1490 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1491 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1492 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1493 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1494 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1495 Heps = _mm256_mul_ps(vfeps,H);
1496 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1497 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1498 velec = _mm256_mul_ps(qq32,VV);
1499 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1500 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
1502 /* Update potential sum for this i atom from the interaction with this j atom. */
1503 velec = _mm256_andnot_ps(dummy_mask,velec);
1504 velecsum = _mm256_add_ps(velecsum,velec);
1508 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1510 /* Calculate temporary vectorial force */
1511 tx = _mm256_mul_ps(fscal,dx32);
1512 ty = _mm256_mul_ps(fscal,dy32);
1513 tz = _mm256_mul_ps(fscal,dz32);
1515 /* Update vectorial force */
1516 fix3 = _mm256_add_ps(fix3,tx);
1517 fiy3 = _mm256_add_ps(fiy3,ty);
1518 fiz3 = _mm256_add_ps(fiz3,tz);
1520 fjx2 = _mm256_add_ps(fjx2,tx);
1521 fjy2 = _mm256_add_ps(fjy2,ty);
1522 fjz2 = _mm256_add_ps(fjz2,tz);
1524 /**************************
1525 * CALCULATE INTERACTIONS *
1526 **************************/
1528 r33 = _mm256_mul_ps(rsq33,rinv33);
1529 r33 = _mm256_andnot_ps(dummy_mask,r33);
1531 /* Calculate table index by multiplying r with table scale and truncate to integer */
1532 rt = _mm256_mul_ps(r33,vftabscale);
1533 vfitab = _mm256_cvttps_epi32(rt);
1534 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1535 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1536 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1537 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1538 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1539 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1541 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1542 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1543 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1544 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1545 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1546 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1547 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1548 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1549 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1550 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1551 Heps = _mm256_mul_ps(vfeps,H);
1552 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1553 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1554 velec = _mm256_mul_ps(qq33,VV);
1555 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1556 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
1558 /* Update potential sum for this i atom from the interaction with this j atom. */
1559 velec = _mm256_andnot_ps(dummy_mask,velec);
1560 velecsum = _mm256_add_ps(velecsum,velec);
1564 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1566 /* Calculate temporary vectorial force */
1567 tx = _mm256_mul_ps(fscal,dx33);
1568 ty = _mm256_mul_ps(fscal,dy33);
1569 tz = _mm256_mul_ps(fscal,dz33);
1571 /* Update vectorial force */
1572 fix3 = _mm256_add_ps(fix3,tx);
1573 fiy3 = _mm256_add_ps(fiy3,ty);
1574 fiz3 = _mm256_add_ps(fiz3,tz);
1576 fjx3 = _mm256_add_ps(fjx3,tx);
1577 fjy3 = _mm256_add_ps(fjy3,ty);
1578 fjz3 = _mm256_add_ps(fjz3,tz);
1580 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1581 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1582 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1583 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1584 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1585 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1586 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1587 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1589 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1590 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1591 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1593 /* Inner loop uses 456 flops */
1596 /* End of innermost loop */
1598 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1599 f+i_coord_offset,fshift+i_shift_offset);
1602 /* Update potential energies */
1603 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1604 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1606 /* Increment number of inner iterations */
1607 inneriter += j_index_end - j_index_start;
1609 /* Outer loop uses 26 flops */
1612 /* Increment number of outer iterations */
1615 /* Update outer/inner flops */
1617 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*456);
1620 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_single
1621 * Electrostatics interaction: CubicSplineTable
1622 * VdW interaction: CubicSplineTable
1623 * Geometry: Water4-Water4
1624 * Calculate force/pot: Force
1627 nb_kernel_ElecCSTab_VdwCSTab_GeomW4W4_F_avx_256_single
1628 (t_nblist * gmx_restrict nlist,
1629 rvec * gmx_restrict xx,
1630 rvec * gmx_restrict ff,
1631 t_forcerec * gmx_restrict fr,
1632 t_mdatoms * gmx_restrict mdatoms,
1633 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1634 t_nrnb * gmx_restrict nrnb)
1636 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1637 * just 0 for non-waters.
1638 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1639 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1641 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1642 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1643 int jnrA,jnrB,jnrC,jnrD;
1644 int jnrE,jnrF,jnrG,jnrH;
1645 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1646 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1647 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1648 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1649 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1650 real rcutoff_scalar;
1651 real *shiftvec,*fshift,*x,*f;
1652 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1653 real scratch[4*DIM];
1654 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1655 real * vdwioffsetptr0;
1656 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1657 real * vdwioffsetptr1;
1658 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1659 real * vdwioffsetptr2;
1660 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1661 real * vdwioffsetptr3;
1662 __m256 ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1663 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1664 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1665 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1666 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1667 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1668 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1669 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D,vdwjidx3E,vdwjidx3F,vdwjidx3G,vdwjidx3H;
1670 __m256 jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1671 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1672 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1673 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1674 __m256 dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1675 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1676 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1677 __m256 dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1678 __m256 dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1679 __m256 dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1680 __m256 dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1681 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1684 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1687 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1688 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1690 __m128i vfitab_lo,vfitab_hi;
1691 __m128i ifour = _mm_set1_epi32(4);
1692 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1694 __m256 dummy_mask,cutoff_mask;
1695 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1696 __m256 one = _mm256_set1_ps(1.0);
1697 __m256 two = _mm256_set1_ps(2.0);
1703 jindex = nlist->jindex;
1705 shiftidx = nlist->shift;
1707 shiftvec = fr->shift_vec[0];
1708 fshift = fr->fshift[0];
1709 facel = _mm256_set1_ps(fr->epsfac);
1710 charge = mdatoms->chargeA;
1711 nvdwtype = fr->ntype;
1712 vdwparam = fr->nbfp;
1713 vdwtype = mdatoms->typeA;
1715 vftab = kernel_data->table_elec_vdw->data;
1716 vftabscale = _mm256_set1_ps(kernel_data->table_elec_vdw->scale);
1718 /* Setup water-specific parameters */
1719 inr = nlist->iinr[0];
1720 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1721 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1722 iq3 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+3]));
1723 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1725 jq1 = _mm256_set1_ps(charge[inr+1]);
1726 jq2 = _mm256_set1_ps(charge[inr+2]);
1727 jq3 = _mm256_set1_ps(charge[inr+3]);
1728 vdwjidx0A = 2*vdwtype[inr+0];
1729 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1730 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1731 qq11 = _mm256_mul_ps(iq1,jq1);
1732 qq12 = _mm256_mul_ps(iq1,jq2);
1733 qq13 = _mm256_mul_ps(iq1,jq3);
1734 qq21 = _mm256_mul_ps(iq2,jq1);
1735 qq22 = _mm256_mul_ps(iq2,jq2);
1736 qq23 = _mm256_mul_ps(iq2,jq3);
1737 qq31 = _mm256_mul_ps(iq3,jq1);
1738 qq32 = _mm256_mul_ps(iq3,jq2);
1739 qq33 = _mm256_mul_ps(iq3,jq3);
1741 /* Avoid stupid compiler warnings */
1742 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1743 j_coord_offsetA = 0;
1744 j_coord_offsetB = 0;
1745 j_coord_offsetC = 0;
1746 j_coord_offsetD = 0;
1747 j_coord_offsetE = 0;
1748 j_coord_offsetF = 0;
1749 j_coord_offsetG = 0;
1750 j_coord_offsetH = 0;
1755 for(iidx=0;iidx<4*DIM;iidx++)
1757 scratch[iidx] = 0.0;
1760 /* Start outer loop over neighborlists */
1761 for(iidx=0; iidx<nri; iidx++)
1763 /* Load shift vector for this list */
1764 i_shift_offset = DIM*shiftidx[iidx];
1766 /* Load limits for loop over neighbors */
1767 j_index_start = jindex[iidx];
1768 j_index_end = jindex[iidx+1];
1770 /* Get outer coordinate index */
1772 i_coord_offset = DIM*inr;
1774 /* Load i particle coords and add shift vector */
1775 gmx_mm256_load_shift_and_4rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1776 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1778 fix0 = _mm256_setzero_ps();
1779 fiy0 = _mm256_setzero_ps();
1780 fiz0 = _mm256_setzero_ps();
1781 fix1 = _mm256_setzero_ps();
1782 fiy1 = _mm256_setzero_ps();
1783 fiz1 = _mm256_setzero_ps();
1784 fix2 = _mm256_setzero_ps();
1785 fiy2 = _mm256_setzero_ps();
1786 fiz2 = _mm256_setzero_ps();
1787 fix3 = _mm256_setzero_ps();
1788 fiy3 = _mm256_setzero_ps();
1789 fiz3 = _mm256_setzero_ps();
1791 /* Start inner kernel loop */
1792 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1795 /* Get j neighbor index, and coordinate index */
1797 jnrB = jjnr[jidx+1];
1798 jnrC = jjnr[jidx+2];
1799 jnrD = jjnr[jidx+3];
1800 jnrE = jjnr[jidx+4];
1801 jnrF = jjnr[jidx+5];
1802 jnrG = jjnr[jidx+6];
1803 jnrH = jjnr[jidx+7];
1804 j_coord_offsetA = DIM*jnrA;
1805 j_coord_offsetB = DIM*jnrB;
1806 j_coord_offsetC = DIM*jnrC;
1807 j_coord_offsetD = DIM*jnrD;
1808 j_coord_offsetE = DIM*jnrE;
1809 j_coord_offsetF = DIM*jnrF;
1810 j_coord_offsetG = DIM*jnrG;
1811 j_coord_offsetH = DIM*jnrH;
1813 /* load j atom coordinates */
1814 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1815 x+j_coord_offsetC,x+j_coord_offsetD,
1816 x+j_coord_offsetE,x+j_coord_offsetF,
1817 x+j_coord_offsetG,x+j_coord_offsetH,
1818 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1819 &jy2,&jz2,&jx3,&jy3,&jz3);
1821 /* Calculate displacement vector */
1822 dx00 = _mm256_sub_ps(ix0,jx0);
1823 dy00 = _mm256_sub_ps(iy0,jy0);
1824 dz00 = _mm256_sub_ps(iz0,jz0);
1825 dx11 = _mm256_sub_ps(ix1,jx1);
1826 dy11 = _mm256_sub_ps(iy1,jy1);
1827 dz11 = _mm256_sub_ps(iz1,jz1);
1828 dx12 = _mm256_sub_ps(ix1,jx2);
1829 dy12 = _mm256_sub_ps(iy1,jy2);
1830 dz12 = _mm256_sub_ps(iz1,jz2);
1831 dx13 = _mm256_sub_ps(ix1,jx3);
1832 dy13 = _mm256_sub_ps(iy1,jy3);
1833 dz13 = _mm256_sub_ps(iz1,jz3);
1834 dx21 = _mm256_sub_ps(ix2,jx1);
1835 dy21 = _mm256_sub_ps(iy2,jy1);
1836 dz21 = _mm256_sub_ps(iz2,jz1);
1837 dx22 = _mm256_sub_ps(ix2,jx2);
1838 dy22 = _mm256_sub_ps(iy2,jy2);
1839 dz22 = _mm256_sub_ps(iz2,jz2);
1840 dx23 = _mm256_sub_ps(ix2,jx3);
1841 dy23 = _mm256_sub_ps(iy2,jy3);
1842 dz23 = _mm256_sub_ps(iz2,jz3);
1843 dx31 = _mm256_sub_ps(ix3,jx1);
1844 dy31 = _mm256_sub_ps(iy3,jy1);
1845 dz31 = _mm256_sub_ps(iz3,jz1);
1846 dx32 = _mm256_sub_ps(ix3,jx2);
1847 dy32 = _mm256_sub_ps(iy3,jy2);
1848 dz32 = _mm256_sub_ps(iz3,jz2);
1849 dx33 = _mm256_sub_ps(ix3,jx3);
1850 dy33 = _mm256_sub_ps(iy3,jy3);
1851 dz33 = _mm256_sub_ps(iz3,jz3);
1853 /* Calculate squared distance and things based on it */
1854 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1855 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1856 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1857 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
1858 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1859 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1860 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
1861 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
1862 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
1863 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
1865 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1866 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1867 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1868 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
1869 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1870 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1871 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
1872 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
1873 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
1874 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
1876 fjx0 = _mm256_setzero_ps();
1877 fjy0 = _mm256_setzero_ps();
1878 fjz0 = _mm256_setzero_ps();
1879 fjx1 = _mm256_setzero_ps();
1880 fjy1 = _mm256_setzero_ps();
1881 fjz1 = _mm256_setzero_ps();
1882 fjx2 = _mm256_setzero_ps();
1883 fjy2 = _mm256_setzero_ps();
1884 fjz2 = _mm256_setzero_ps();
1885 fjx3 = _mm256_setzero_ps();
1886 fjy3 = _mm256_setzero_ps();
1887 fjz3 = _mm256_setzero_ps();
1889 /**************************
1890 * CALCULATE INTERACTIONS *
1891 **************************/
1893 r00 = _mm256_mul_ps(rsq00,rinv00);
1895 /* Calculate table index by multiplying r with table scale and truncate to integer */
1896 rt = _mm256_mul_ps(r00,vftabscale);
1897 vfitab = _mm256_cvttps_epi32(rt);
1898 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1899 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1900 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1901 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1902 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1903 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1905 /* CUBIC SPLINE TABLE DISPERSION */
1906 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1907 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1908 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1909 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1910 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1911 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1912 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1913 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1914 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1915 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1916 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1917 Heps = _mm256_mul_ps(vfeps,H);
1918 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1919 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1920 fvdw6 = _mm256_mul_ps(c6_00,FF);
1922 /* CUBIC SPLINE TABLE REPULSION */
1923 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1924 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1925 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1926 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1927 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1928 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1929 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1930 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1931 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1932 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1933 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1934 Heps = _mm256_mul_ps(vfeps,H);
1935 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1936 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1937 fvdw12 = _mm256_mul_ps(c12_00,FF);
1938 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1942 /* Calculate temporary vectorial force */
1943 tx = _mm256_mul_ps(fscal,dx00);
1944 ty = _mm256_mul_ps(fscal,dy00);
1945 tz = _mm256_mul_ps(fscal,dz00);
1947 /* Update vectorial force */
1948 fix0 = _mm256_add_ps(fix0,tx);
1949 fiy0 = _mm256_add_ps(fiy0,ty);
1950 fiz0 = _mm256_add_ps(fiz0,tz);
1952 fjx0 = _mm256_add_ps(fjx0,tx);
1953 fjy0 = _mm256_add_ps(fjy0,ty);
1954 fjz0 = _mm256_add_ps(fjz0,tz);
1956 /**************************
1957 * CALCULATE INTERACTIONS *
1958 **************************/
1960 r11 = _mm256_mul_ps(rsq11,rinv11);
1962 /* Calculate table index by multiplying r with table scale and truncate to integer */
1963 rt = _mm256_mul_ps(r11,vftabscale);
1964 vfitab = _mm256_cvttps_epi32(rt);
1965 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1966 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1967 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1968 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1969 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
1970 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
1972 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1973 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1974 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1975 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1976 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1977 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1978 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1979 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1980 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1981 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1982 Heps = _mm256_mul_ps(vfeps,H);
1983 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1984 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1985 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1989 /* Calculate temporary vectorial force */
1990 tx = _mm256_mul_ps(fscal,dx11);
1991 ty = _mm256_mul_ps(fscal,dy11);
1992 tz = _mm256_mul_ps(fscal,dz11);
1994 /* Update vectorial force */
1995 fix1 = _mm256_add_ps(fix1,tx);
1996 fiy1 = _mm256_add_ps(fiy1,ty);
1997 fiz1 = _mm256_add_ps(fiz1,tz);
1999 fjx1 = _mm256_add_ps(fjx1,tx);
2000 fjy1 = _mm256_add_ps(fjy1,ty);
2001 fjz1 = _mm256_add_ps(fjz1,tz);
2003 /**************************
2004 * CALCULATE INTERACTIONS *
2005 **************************/
2007 r12 = _mm256_mul_ps(rsq12,rinv12);
2009 /* Calculate table index by multiplying r with table scale and truncate to integer */
2010 rt = _mm256_mul_ps(r12,vftabscale);
2011 vfitab = _mm256_cvttps_epi32(rt);
2012 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2013 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2014 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2015 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2016 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2017 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2019 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2020 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2021 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2022 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2023 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2024 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2025 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2026 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2027 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2028 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2029 Heps = _mm256_mul_ps(vfeps,H);
2030 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2031 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2032 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2036 /* Calculate temporary vectorial force */
2037 tx = _mm256_mul_ps(fscal,dx12);
2038 ty = _mm256_mul_ps(fscal,dy12);
2039 tz = _mm256_mul_ps(fscal,dz12);
2041 /* Update vectorial force */
2042 fix1 = _mm256_add_ps(fix1,tx);
2043 fiy1 = _mm256_add_ps(fiy1,ty);
2044 fiz1 = _mm256_add_ps(fiz1,tz);
2046 fjx2 = _mm256_add_ps(fjx2,tx);
2047 fjy2 = _mm256_add_ps(fjy2,ty);
2048 fjz2 = _mm256_add_ps(fjz2,tz);
2050 /**************************
2051 * CALCULATE INTERACTIONS *
2052 **************************/
2054 r13 = _mm256_mul_ps(rsq13,rinv13);
2056 /* Calculate table index by multiplying r with table scale and truncate to integer */
2057 rt = _mm256_mul_ps(r13,vftabscale);
2058 vfitab = _mm256_cvttps_epi32(rt);
2059 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2060 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2061 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2062 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2063 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2064 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2066 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2067 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2068 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2069 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2070 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2071 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2072 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2073 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2074 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2075 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2076 Heps = _mm256_mul_ps(vfeps,H);
2077 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2078 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2079 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
2083 /* Calculate temporary vectorial force */
2084 tx = _mm256_mul_ps(fscal,dx13);
2085 ty = _mm256_mul_ps(fscal,dy13);
2086 tz = _mm256_mul_ps(fscal,dz13);
2088 /* Update vectorial force */
2089 fix1 = _mm256_add_ps(fix1,tx);
2090 fiy1 = _mm256_add_ps(fiy1,ty);
2091 fiz1 = _mm256_add_ps(fiz1,tz);
2093 fjx3 = _mm256_add_ps(fjx3,tx);
2094 fjy3 = _mm256_add_ps(fjy3,ty);
2095 fjz3 = _mm256_add_ps(fjz3,tz);
2097 /**************************
2098 * CALCULATE INTERACTIONS *
2099 **************************/
2101 r21 = _mm256_mul_ps(rsq21,rinv21);
2103 /* Calculate table index by multiplying r with table scale and truncate to integer */
2104 rt = _mm256_mul_ps(r21,vftabscale);
2105 vfitab = _mm256_cvttps_epi32(rt);
2106 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2107 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2108 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2109 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2110 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2111 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2113 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2114 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2115 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2116 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2117 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2118 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2119 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2120 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2121 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2122 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2123 Heps = _mm256_mul_ps(vfeps,H);
2124 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2125 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2126 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2130 /* Calculate temporary vectorial force */
2131 tx = _mm256_mul_ps(fscal,dx21);
2132 ty = _mm256_mul_ps(fscal,dy21);
2133 tz = _mm256_mul_ps(fscal,dz21);
2135 /* Update vectorial force */
2136 fix2 = _mm256_add_ps(fix2,tx);
2137 fiy2 = _mm256_add_ps(fiy2,ty);
2138 fiz2 = _mm256_add_ps(fiz2,tz);
2140 fjx1 = _mm256_add_ps(fjx1,tx);
2141 fjy1 = _mm256_add_ps(fjy1,ty);
2142 fjz1 = _mm256_add_ps(fjz1,tz);
2144 /**************************
2145 * CALCULATE INTERACTIONS *
2146 **************************/
2148 r22 = _mm256_mul_ps(rsq22,rinv22);
2150 /* Calculate table index by multiplying r with table scale and truncate to integer */
2151 rt = _mm256_mul_ps(r22,vftabscale);
2152 vfitab = _mm256_cvttps_epi32(rt);
2153 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2154 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2155 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2156 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2157 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2158 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2160 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2161 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2162 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2163 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2164 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2165 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2166 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2167 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2168 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2169 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2170 Heps = _mm256_mul_ps(vfeps,H);
2171 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2172 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2173 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2177 /* Calculate temporary vectorial force */
2178 tx = _mm256_mul_ps(fscal,dx22);
2179 ty = _mm256_mul_ps(fscal,dy22);
2180 tz = _mm256_mul_ps(fscal,dz22);
2182 /* Update vectorial force */
2183 fix2 = _mm256_add_ps(fix2,tx);
2184 fiy2 = _mm256_add_ps(fiy2,ty);
2185 fiz2 = _mm256_add_ps(fiz2,tz);
2187 fjx2 = _mm256_add_ps(fjx2,tx);
2188 fjy2 = _mm256_add_ps(fjy2,ty);
2189 fjz2 = _mm256_add_ps(fjz2,tz);
2191 /**************************
2192 * CALCULATE INTERACTIONS *
2193 **************************/
2195 r23 = _mm256_mul_ps(rsq23,rinv23);
2197 /* Calculate table index by multiplying r with table scale and truncate to integer */
2198 rt = _mm256_mul_ps(r23,vftabscale);
2199 vfitab = _mm256_cvttps_epi32(rt);
2200 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2201 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2202 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2203 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2204 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2205 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2207 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2208 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2209 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2210 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2211 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2212 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2213 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2214 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2215 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2216 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2217 Heps = _mm256_mul_ps(vfeps,H);
2218 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2219 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2220 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2224 /* Calculate temporary vectorial force */
2225 tx = _mm256_mul_ps(fscal,dx23);
2226 ty = _mm256_mul_ps(fscal,dy23);
2227 tz = _mm256_mul_ps(fscal,dz23);
2229 /* Update vectorial force */
2230 fix2 = _mm256_add_ps(fix2,tx);
2231 fiy2 = _mm256_add_ps(fiy2,ty);
2232 fiz2 = _mm256_add_ps(fiz2,tz);
2234 fjx3 = _mm256_add_ps(fjx3,tx);
2235 fjy3 = _mm256_add_ps(fjy3,ty);
2236 fjz3 = _mm256_add_ps(fjz3,tz);
2238 /**************************
2239 * CALCULATE INTERACTIONS *
2240 **************************/
2242 r31 = _mm256_mul_ps(rsq31,rinv31);
2244 /* Calculate table index by multiplying r with table scale and truncate to integer */
2245 rt = _mm256_mul_ps(r31,vftabscale);
2246 vfitab = _mm256_cvttps_epi32(rt);
2247 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2248 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2249 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2250 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2251 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2252 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2254 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2255 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2256 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2257 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2258 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2259 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2260 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2261 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2262 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2263 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2264 Heps = _mm256_mul_ps(vfeps,H);
2265 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2266 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2267 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2271 /* Calculate temporary vectorial force */
2272 tx = _mm256_mul_ps(fscal,dx31);
2273 ty = _mm256_mul_ps(fscal,dy31);
2274 tz = _mm256_mul_ps(fscal,dz31);
2276 /* Update vectorial force */
2277 fix3 = _mm256_add_ps(fix3,tx);
2278 fiy3 = _mm256_add_ps(fiy3,ty);
2279 fiz3 = _mm256_add_ps(fiz3,tz);
2281 fjx1 = _mm256_add_ps(fjx1,tx);
2282 fjy1 = _mm256_add_ps(fjy1,ty);
2283 fjz1 = _mm256_add_ps(fjz1,tz);
2285 /**************************
2286 * CALCULATE INTERACTIONS *
2287 **************************/
2289 r32 = _mm256_mul_ps(rsq32,rinv32);
2291 /* Calculate table index by multiplying r with table scale and truncate to integer */
2292 rt = _mm256_mul_ps(r32,vftabscale);
2293 vfitab = _mm256_cvttps_epi32(rt);
2294 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2295 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2296 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2297 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2298 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2299 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2301 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2302 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2303 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2304 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2305 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2306 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2307 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2308 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2309 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2310 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2311 Heps = _mm256_mul_ps(vfeps,H);
2312 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2313 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2314 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2318 /* Calculate temporary vectorial force */
2319 tx = _mm256_mul_ps(fscal,dx32);
2320 ty = _mm256_mul_ps(fscal,dy32);
2321 tz = _mm256_mul_ps(fscal,dz32);
2323 /* Update vectorial force */
2324 fix3 = _mm256_add_ps(fix3,tx);
2325 fiy3 = _mm256_add_ps(fiy3,ty);
2326 fiz3 = _mm256_add_ps(fiz3,tz);
2328 fjx2 = _mm256_add_ps(fjx2,tx);
2329 fjy2 = _mm256_add_ps(fjy2,ty);
2330 fjz2 = _mm256_add_ps(fjz2,tz);
2332 /**************************
2333 * CALCULATE INTERACTIONS *
2334 **************************/
2336 r33 = _mm256_mul_ps(rsq33,rinv33);
2338 /* Calculate table index by multiplying r with table scale and truncate to integer */
2339 rt = _mm256_mul_ps(r33,vftabscale);
2340 vfitab = _mm256_cvttps_epi32(rt);
2341 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2342 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2343 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2344 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2345 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2346 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2348 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2349 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2350 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2351 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2352 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2353 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2354 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2355 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2356 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2357 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2358 Heps = _mm256_mul_ps(vfeps,H);
2359 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2360 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2361 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
2365 /* Calculate temporary vectorial force */
2366 tx = _mm256_mul_ps(fscal,dx33);
2367 ty = _mm256_mul_ps(fscal,dy33);
2368 tz = _mm256_mul_ps(fscal,dz33);
2370 /* Update vectorial force */
2371 fix3 = _mm256_add_ps(fix3,tx);
2372 fiy3 = _mm256_add_ps(fiy3,ty);
2373 fiz3 = _mm256_add_ps(fiz3,tz);
2375 fjx3 = _mm256_add_ps(fjx3,tx);
2376 fjy3 = _mm256_add_ps(fjy3,ty);
2377 fjz3 = _mm256_add_ps(fjz3,tz);
2379 fjptrA = f+j_coord_offsetA;
2380 fjptrB = f+j_coord_offsetB;
2381 fjptrC = f+j_coord_offsetC;
2382 fjptrD = f+j_coord_offsetD;
2383 fjptrE = f+j_coord_offsetE;
2384 fjptrF = f+j_coord_offsetF;
2385 fjptrG = f+j_coord_offsetG;
2386 fjptrH = f+j_coord_offsetH;
2388 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2389 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
2390 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
2392 /* Inner loop uses 402 flops */
2395 if(jidx<j_index_end)
2398 /* Get j neighbor index, and coordinate index */
2399 jnrlistA = jjnr[jidx];
2400 jnrlistB = jjnr[jidx+1];
2401 jnrlistC = jjnr[jidx+2];
2402 jnrlistD = jjnr[jidx+3];
2403 jnrlistE = jjnr[jidx+4];
2404 jnrlistF = jjnr[jidx+5];
2405 jnrlistG = jjnr[jidx+6];
2406 jnrlistH = jjnr[jidx+7];
2407 /* Sign of each element will be negative for non-real atoms.
2408 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2409 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2411 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2412 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2414 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2415 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2416 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2417 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2418 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2419 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2420 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2421 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2422 j_coord_offsetA = DIM*jnrA;
2423 j_coord_offsetB = DIM*jnrB;
2424 j_coord_offsetC = DIM*jnrC;
2425 j_coord_offsetD = DIM*jnrD;
2426 j_coord_offsetE = DIM*jnrE;
2427 j_coord_offsetF = DIM*jnrF;
2428 j_coord_offsetG = DIM*jnrG;
2429 j_coord_offsetH = DIM*jnrH;
2431 /* load j atom coordinates */
2432 gmx_mm256_load_4rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2433 x+j_coord_offsetC,x+j_coord_offsetD,
2434 x+j_coord_offsetE,x+j_coord_offsetF,
2435 x+j_coord_offsetG,x+j_coord_offsetH,
2436 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
2437 &jy2,&jz2,&jx3,&jy3,&jz3);
2439 /* Calculate displacement vector */
2440 dx00 = _mm256_sub_ps(ix0,jx0);
2441 dy00 = _mm256_sub_ps(iy0,jy0);
2442 dz00 = _mm256_sub_ps(iz0,jz0);
2443 dx11 = _mm256_sub_ps(ix1,jx1);
2444 dy11 = _mm256_sub_ps(iy1,jy1);
2445 dz11 = _mm256_sub_ps(iz1,jz1);
2446 dx12 = _mm256_sub_ps(ix1,jx2);
2447 dy12 = _mm256_sub_ps(iy1,jy2);
2448 dz12 = _mm256_sub_ps(iz1,jz2);
2449 dx13 = _mm256_sub_ps(ix1,jx3);
2450 dy13 = _mm256_sub_ps(iy1,jy3);
2451 dz13 = _mm256_sub_ps(iz1,jz3);
2452 dx21 = _mm256_sub_ps(ix2,jx1);
2453 dy21 = _mm256_sub_ps(iy2,jy1);
2454 dz21 = _mm256_sub_ps(iz2,jz1);
2455 dx22 = _mm256_sub_ps(ix2,jx2);
2456 dy22 = _mm256_sub_ps(iy2,jy2);
2457 dz22 = _mm256_sub_ps(iz2,jz2);
2458 dx23 = _mm256_sub_ps(ix2,jx3);
2459 dy23 = _mm256_sub_ps(iy2,jy3);
2460 dz23 = _mm256_sub_ps(iz2,jz3);
2461 dx31 = _mm256_sub_ps(ix3,jx1);
2462 dy31 = _mm256_sub_ps(iy3,jy1);
2463 dz31 = _mm256_sub_ps(iz3,jz1);
2464 dx32 = _mm256_sub_ps(ix3,jx2);
2465 dy32 = _mm256_sub_ps(iy3,jy2);
2466 dz32 = _mm256_sub_ps(iz3,jz2);
2467 dx33 = _mm256_sub_ps(ix3,jx3);
2468 dy33 = _mm256_sub_ps(iy3,jy3);
2469 dz33 = _mm256_sub_ps(iz3,jz3);
2471 /* Calculate squared distance and things based on it */
2472 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2473 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2474 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2475 rsq13 = gmx_mm256_calc_rsq_ps(dx13,dy13,dz13);
2476 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2477 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2478 rsq23 = gmx_mm256_calc_rsq_ps(dx23,dy23,dz23);
2479 rsq31 = gmx_mm256_calc_rsq_ps(dx31,dy31,dz31);
2480 rsq32 = gmx_mm256_calc_rsq_ps(dx32,dy32,dz32);
2481 rsq33 = gmx_mm256_calc_rsq_ps(dx33,dy33,dz33);
2483 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2484 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2485 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2486 rinv13 = gmx_mm256_invsqrt_ps(rsq13);
2487 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2488 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2489 rinv23 = gmx_mm256_invsqrt_ps(rsq23);
2490 rinv31 = gmx_mm256_invsqrt_ps(rsq31);
2491 rinv32 = gmx_mm256_invsqrt_ps(rsq32);
2492 rinv33 = gmx_mm256_invsqrt_ps(rsq33);
2494 fjx0 = _mm256_setzero_ps();
2495 fjy0 = _mm256_setzero_ps();
2496 fjz0 = _mm256_setzero_ps();
2497 fjx1 = _mm256_setzero_ps();
2498 fjy1 = _mm256_setzero_ps();
2499 fjz1 = _mm256_setzero_ps();
2500 fjx2 = _mm256_setzero_ps();
2501 fjy2 = _mm256_setzero_ps();
2502 fjz2 = _mm256_setzero_ps();
2503 fjx3 = _mm256_setzero_ps();
2504 fjy3 = _mm256_setzero_ps();
2505 fjz3 = _mm256_setzero_ps();
2507 /**************************
2508 * CALCULATE INTERACTIONS *
2509 **************************/
2511 r00 = _mm256_mul_ps(rsq00,rinv00);
2512 r00 = _mm256_andnot_ps(dummy_mask,r00);
2514 /* Calculate table index by multiplying r with table scale and truncate to integer */
2515 rt = _mm256_mul_ps(r00,vftabscale);
2516 vfitab = _mm256_cvttps_epi32(rt);
2517 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2518 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2519 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2520 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2521 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2522 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2524 /* CUBIC SPLINE TABLE DISPERSION */
2525 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2526 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2527 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2528 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2529 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2530 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2531 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2532 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2533 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2534 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2535 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2536 Heps = _mm256_mul_ps(vfeps,H);
2537 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2538 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2539 fvdw6 = _mm256_mul_ps(c6_00,FF);
2541 /* CUBIC SPLINE TABLE REPULSION */
2542 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2543 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2544 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2545 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2546 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2547 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2548 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2549 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2550 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2551 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2552 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2553 Heps = _mm256_mul_ps(vfeps,H);
2554 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2555 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2556 fvdw12 = _mm256_mul_ps(c12_00,FF);
2557 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2561 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2563 /* Calculate temporary vectorial force */
2564 tx = _mm256_mul_ps(fscal,dx00);
2565 ty = _mm256_mul_ps(fscal,dy00);
2566 tz = _mm256_mul_ps(fscal,dz00);
2568 /* Update vectorial force */
2569 fix0 = _mm256_add_ps(fix0,tx);
2570 fiy0 = _mm256_add_ps(fiy0,ty);
2571 fiz0 = _mm256_add_ps(fiz0,tz);
2573 fjx0 = _mm256_add_ps(fjx0,tx);
2574 fjy0 = _mm256_add_ps(fjy0,ty);
2575 fjz0 = _mm256_add_ps(fjz0,tz);
2577 /**************************
2578 * CALCULATE INTERACTIONS *
2579 **************************/
2581 r11 = _mm256_mul_ps(rsq11,rinv11);
2582 r11 = _mm256_andnot_ps(dummy_mask,r11);
2584 /* Calculate table index by multiplying r with table scale and truncate to integer */
2585 rt = _mm256_mul_ps(r11,vftabscale);
2586 vfitab = _mm256_cvttps_epi32(rt);
2587 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2588 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2589 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2590 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2591 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2592 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2594 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2595 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2596 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2597 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2598 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2599 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2600 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2601 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2602 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2603 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2604 Heps = _mm256_mul_ps(vfeps,H);
2605 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2606 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2607 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2611 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2613 /* Calculate temporary vectorial force */
2614 tx = _mm256_mul_ps(fscal,dx11);
2615 ty = _mm256_mul_ps(fscal,dy11);
2616 tz = _mm256_mul_ps(fscal,dz11);
2618 /* Update vectorial force */
2619 fix1 = _mm256_add_ps(fix1,tx);
2620 fiy1 = _mm256_add_ps(fiy1,ty);
2621 fiz1 = _mm256_add_ps(fiz1,tz);
2623 fjx1 = _mm256_add_ps(fjx1,tx);
2624 fjy1 = _mm256_add_ps(fjy1,ty);
2625 fjz1 = _mm256_add_ps(fjz1,tz);
2627 /**************************
2628 * CALCULATE INTERACTIONS *
2629 **************************/
2631 r12 = _mm256_mul_ps(rsq12,rinv12);
2632 r12 = _mm256_andnot_ps(dummy_mask,r12);
2634 /* Calculate table index by multiplying r with table scale and truncate to integer */
2635 rt = _mm256_mul_ps(r12,vftabscale);
2636 vfitab = _mm256_cvttps_epi32(rt);
2637 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2638 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2639 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2640 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2641 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2642 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2644 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2645 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2646 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2647 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2648 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2649 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2650 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2651 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2652 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2653 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2654 Heps = _mm256_mul_ps(vfeps,H);
2655 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2656 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2657 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2661 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2663 /* Calculate temporary vectorial force */
2664 tx = _mm256_mul_ps(fscal,dx12);
2665 ty = _mm256_mul_ps(fscal,dy12);
2666 tz = _mm256_mul_ps(fscal,dz12);
2668 /* Update vectorial force */
2669 fix1 = _mm256_add_ps(fix1,tx);
2670 fiy1 = _mm256_add_ps(fiy1,ty);
2671 fiz1 = _mm256_add_ps(fiz1,tz);
2673 fjx2 = _mm256_add_ps(fjx2,tx);
2674 fjy2 = _mm256_add_ps(fjy2,ty);
2675 fjz2 = _mm256_add_ps(fjz2,tz);
2677 /**************************
2678 * CALCULATE INTERACTIONS *
2679 **************************/
2681 r13 = _mm256_mul_ps(rsq13,rinv13);
2682 r13 = _mm256_andnot_ps(dummy_mask,r13);
2684 /* Calculate table index by multiplying r with table scale and truncate to integer */
2685 rt = _mm256_mul_ps(r13,vftabscale);
2686 vfitab = _mm256_cvttps_epi32(rt);
2687 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2688 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2689 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2690 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2691 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2692 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2694 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2695 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2696 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2697 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2698 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2699 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2700 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2701 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2702 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2703 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2704 Heps = _mm256_mul_ps(vfeps,H);
2705 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2706 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2707 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq13,FF),_mm256_mul_ps(vftabscale,rinv13)));
2711 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2713 /* Calculate temporary vectorial force */
2714 tx = _mm256_mul_ps(fscal,dx13);
2715 ty = _mm256_mul_ps(fscal,dy13);
2716 tz = _mm256_mul_ps(fscal,dz13);
2718 /* Update vectorial force */
2719 fix1 = _mm256_add_ps(fix1,tx);
2720 fiy1 = _mm256_add_ps(fiy1,ty);
2721 fiz1 = _mm256_add_ps(fiz1,tz);
2723 fjx3 = _mm256_add_ps(fjx3,tx);
2724 fjy3 = _mm256_add_ps(fjy3,ty);
2725 fjz3 = _mm256_add_ps(fjz3,tz);
2727 /**************************
2728 * CALCULATE INTERACTIONS *
2729 **************************/
2731 r21 = _mm256_mul_ps(rsq21,rinv21);
2732 r21 = _mm256_andnot_ps(dummy_mask,r21);
2734 /* Calculate table index by multiplying r with table scale and truncate to integer */
2735 rt = _mm256_mul_ps(r21,vftabscale);
2736 vfitab = _mm256_cvttps_epi32(rt);
2737 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2738 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2739 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2740 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2741 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2742 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2744 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2745 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2746 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2747 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2748 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2749 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2750 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2751 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2752 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2753 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2754 Heps = _mm256_mul_ps(vfeps,H);
2755 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2756 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2757 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2761 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2763 /* Calculate temporary vectorial force */
2764 tx = _mm256_mul_ps(fscal,dx21);
2765 ty = _mm256_mul_ps(fscal,dy21);
2766 tz = _mm256_mul_ps(fscal,dz21);
2768 /* Update vectorial force */
2769 fix2 = _mm256_add_ps(fix2,tx);
2770 fiy2 = _mm256_add_ps(fiy2,ty);
2771 fiz2 = _mm256_add_ps(fiz2,tz);
2773 fjx1 = _mm256_add_ps(fjx1,tx);
2774 fjy1 = _mm256_add_ps(fjy1,ty);
2775 fjz1 = _mm256_add_ps(fjz1,tz);
2777 /**************************
2778 * CALCULATE INTERACTIONS *
2779 **************************/
2781 r22 = _mm256_mul_ps(rsq22,rinv22);
2782 r22 = _mm256_andnot_ps(dummy_mask,r22);
2784 /* Calculate table index by multiplying r with table scale and truncate to integer */
2785 rt = _mm256_mul_ps(r22,vftabscale);
2786 vfitab = _mm256_cvttps_epi32(rt);
2787 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2788 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2789 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2790 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2791 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2792 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2794 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2795 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2796 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2797 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2798 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2799 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2800 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2801 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2802 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2803 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2804 Heps = _mm256_mul_ps(vfeps,H);
2805 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2806 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2807 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2811 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2813 /* Calculate temporary vectorial force */
2814 tx = _mm256_mul_ps(fscal,dx22);
2815 ty = _mm256_mul_ps(fscal,dy22);
2816 tz = _mm256_mul_ps(fscal,dz22);
2818 /* Update vectorial force */
2819 fix2 = _mm256_add_ps(fix2,tx);
2820 fiy2 = _mm256_add_ps(fiy2,ty);
2821 fiz2 = _mm256_add_ps(fiz2,tz);
2823 fjx2 = _mm256_add_ps(fjx2,tx);
2824 fjy2 = _mm256_add_ps(fjy2,ty);
2825 fjz2 = _mm256_add_ps(fjz2,tz);
2827 /**************************
2828 * CALCULATE INTERACTIONS *
2829 **************************/
2831 r23 = _mm256_mul_ps(rsq23,rinv23);
2832 r23 = _mm256_andnot_ps(dummy_mask,r23);
2834 /* Calculate table index by multiplying r with table scale and truncate to integer */
2835 rt = _mm256_mul_ps(r23,vftabscale);
2836 vfitab = _mm256_cvttps_epi32(rt);
2837 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2838 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2839 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2840 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2841 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2842 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2844 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2845 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2846 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2847 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2848 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2849 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2850 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2851 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2852 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2853 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2854 Heps = _mm256_mul_ps(vfeps,H);
2855 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2856 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2857 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq23,FF),_mm256_mul_ps(vftabscale,rinv23)));
2861 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2863 /* Calculate temporary vectorial force */
2864 tx = _mm256_mul_ps(fscal,dx23);
2865 ty = _mm256_mul_ps(fscal,dy23);
2866 tz = _mm256_mul_ps(fscal,dz23);
2868 /* Update vectorial force */
2869 fix2 = _mm256_add_ps(fix2,tx);
2870 fiy2 = _mm256_add_ps(fiy2,ty);
2871 fiz2 = _mm256_add_ps(fiz2,tz);
2873 fjx3 = _mm256_add_ps(fjx3,tx);
2874 fjy3 = _mm256_add_ps(fjy3,ty);
2875 fjz3 = _mm256_add_ps(fjz3,tz);
2877 /**************************
2878 * CALCULATE INTERACTIONS *
2879 **************************/
2881 r31 = _mm256_mul_ps(rsq31,rinv31);
2882 r31 = _mm256_andnot_ps(dummy_mask,r31);
2884 /* Calculate table index by multiplying r with table scale and truncate to integer */
2885 rt = _mm256_mul_ps(r31,vftabscale);
2886 vfitab = _mm256_cvttps_epi32(rt);
2887 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2888 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2889 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2890 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2891 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2892 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2894 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2895 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2896 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2897 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2898 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2899 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2900 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2901 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2902 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2903 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2904 Heps = _mm256_mul_ps(vfeps,H);
2905 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2906 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2907 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq31,FF),_mm256_mul_ps(vftabscale,rinv31)));
2911 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2913 /* Calculate temporary vectorial force */
2914 tx = _mm256_mul_ps(fscal,dx31);
2915 ty = _mm256_mul_ps(fscal,dy31);
2916 tz = _mm256_mul_ps(fscal,dz31);
2918 /* Update vectorial force */
2919 fix3 = _mm256_add_ps(fix3,tx);
2920 fiy3 = _mm256_add_ps(fiy3,ty);
2921 fiz3 = _mm256_add_ps(fiz3,tz);
2923 fjx1 = _mm256_add_ps(fjx1,tx);
2924 fjy1 = _mm256_add_ps(fjy1,ty);
2925 fjz1 = _mm256_add_ps(fjz1,tz);
2927 /**************************
2928 * CALCULATE INTERACTIONS *
2929 **************************/
2931 r32 = _mm256_mul_ps(rsq32,rinv32);
2932 r32 = _mm256_andnot_ps(dummy_mask,r32);
2934 /* Calculate table index by multiplying r with table scale and truncate to integer */
2935 rt = _mm256_mul_ps(r32,vftabscale);
2936 vfitab = _mm256_cvttps_epi32(rt);
2937 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2938 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2939 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2940 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2941 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2942 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2944 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2945 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2946 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2947 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2948 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2949 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2950 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2951 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2952 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2953 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2954 Heps = _mm256_mul_ps(vfeps,H);
2955 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2956 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2957 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq32,FF),_mm256_mul_ps(vftabscale,rinv32)));
2961 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2963 /* Calculate temporary vectorial force */
2964 tx = _mm256_mul_ps(fscal,dx32);
2965 ty = _mm256_mul_ps(fscal,dy32);
2966 tz = _mm256_mul_ps(fscal,dz32);
2968 /* Update vectorial force */
2969 fix3 = _mm256_add_ps(fix3,tx);
2970 fiy3 = _mm256_add_ps(fiy3,ty);
2971 fiz3 = _mm256_add_ps(fiz3,tz);
2973 fjx2 = _mm256_add_ps(fjx2,tx);
2974 fjy2 = _mm256_add_ps(fjy2,ty);
2975 fjz2 = _mm256_add_ps(fjz2,tz);
2977 /**************************
2978 * CALCULATE INTERACTIONS *
2979 **************************/
2981 r33 = _mm256_mul_ps(rsq33,rinv33);
2982 r33 = _mm256_andnot_ps(dummy_mask,r33);
2984 /* Calculate table index by multiplying r with table scale and truncate to integer */
2985 rt = _mm256_mul_ps(r33,vftabscale);
2986 vfitab = _mm256_cvttps_epi32(rt);
2987 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2988 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2989 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2990 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2991 vfitab_lo = _mm_slli_epi32(_mm_add_epi32(vfitab_lo,_mm_slli_epi32(vfitab_lo,1)),2);
2992 vfitab_hi = _mm_slli_epi32(_mm_add_epi32(vfitab_hi,_mm_slli_epi32(vfitab_hi,1)),2);
2994 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2995 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2996 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2997 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2998 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2999 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
3000 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
3001 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
3002 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
3003 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
3004 Heps = _mm256_mul_ps(vfeps,H);
3005 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
3006 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
3007 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq33,FF),_mm256_mul_ps(vftabscale,rinv33)));
3011 fscal = _mm256_andnot_ps(dummy_mask,fscal);
3013 /* Calculate temporary vectorial force */
3014 tx = _mm256_mul_ps(fscal,dx33);
3015 ty = _mm256_mul_ps(fscal,dy33);
3016 tz = _mm256_mul_ps(fscal,dz33);
3018 /* Update vectorial force */
3019 fix3 = _mm256_add_ps(fix3,tx);
3020 fiy3 = _mm256_add_ps(fiy3,ty);
3021 fiz3 = _mm256_add_ps(fiz3,tz);
3023 fjx3 = _mm256_add_ps(fjx3,tx);
3024 fjy3 = _mm256_add_ps(fjy3,ty);
3025 fjz3 = _mm256_add_ps(fjz3,tz);
3027 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
3028 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
3029 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
3030 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
3031 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
3032 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
3033 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
3034 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
3036 gmx_mm256_decrement_4rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
3037 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
3038 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
3040 /* Inner loop uses 412 flops */
3043 /* End of innermost loop */
3045 gmx_mm256_update_iforce_4atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
3046 f+i_coord_offset,fshift+i_shift_offset);
3048 /* Increment number of inner iterations */
3049 inneriter += j_index_end - j_index_start;
3051 /* Outer loop uses 24 flops */
3054 /* Increment number of outer iterations */
3057 /* Update outer/inner flops */
3059 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*412);