2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_single kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_single.h"
48 #include "kernelutil_x86_avx_256_single.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single
52 * Electrostatics interaction: CubicSplineTable
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Water3
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_VF_avx_256_single
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
70 * jnr indices corresponding to data put in the eight positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrE,jnrF,jnrG,jnrH;
76 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
77 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
78 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
79 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
80 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
82 real *shiftvec,*fshift,*x,*f;
83 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
85 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
86 real * vdwioffsetptr0;
87 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
88 real * vdwioffsetptr1;
89 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
90 real * vdwioffsetptr2;
91 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
92 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
93 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
94 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
95 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
96 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
97 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
98 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
99 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
100 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
101 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
102 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
103 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
104 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
105 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
106 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
107 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
110 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
113 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
114 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
116 __m128i vfitab_lo,vfitab_hi;
117 __m128i ifour = _mm_set1_epi32(4);
118 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
120 __m256 dummy_mask,cutoff_mask;
121 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
122 __m256 one = _mm256_set1_ps(1.0);
123 __m256 two = _mm256_set1_ps(2.0);
129 jindex = nlist->jindex;
131 shiftidx = nlist->shift;
133 shiftvec = fr->shift_vec[0];
134 fshift = fr->fshift[0];
135 facel = _mm256_set1_ps(fr->epsfac);
136 charge = mdatoms->chargeA;
137 nvdwtype = fr->ntype;
139 vdwtype = mdatoms->typeA;
141 vftab = kernel_data->table_elec->data;
142 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
144 /* Setup water-specific parameters */
145 inr = nlist->iinr[0];
146 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
147 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
148 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
149 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
151 jq0 = _mm256_set1_ps(charge[inr+0]);
152 jq1 = _mm256_set1_ps(charge[inr+1]);
153 jq2 = _mm256_set1_ps(charge[inr+2]);
154 vdwjidx0A = 2*vdwtype[inr+0];
155 qq00 = _mm256_mul_ps(iq0,jq0);
156 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
157 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
158 qq01 = _mm256_mul_ps(iq0,jq1);
159 qq02 = _mm256_mul_ps(iq0,jq2);
160 qq10 = _mm256_mul_ps(iq1,jq0);
161 qq11 = _mm256_mul_ps(iq1,jq1);
162 qq12 = _mm256_mul_ps(iq1,jq2);
163 qq20 = _mm256_mul_ps(iq2,jq0);
164 qq21 = _mm256_mul_ps(iq2,jq1);
165 qq22 = _mm256_mul_ps(iq2,jq2);
167 /* Avoid stupid compiler warnings */
168 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
181 for(iidx=0;iidx<4*DIM;iidx++)
186 /* Start outer loop over neighborlists */
187 for(iidx=0; iidx<nri; iidx++)
189 /* Load shift vector for this list */
190 i_shift_offset = DIM*shiftidx[iidx];
192 /* Load limits for loop over neighbors */
193 j_index_start = jindex[iidx];
194 j_index_end = jindex[iidx+1];
196 /* Get outer coordinate index */
198 i_coord_offset = DIM*inr;
200 /* Load i particle coords and add shift vector */
201 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
202 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
204 fix0 = _mm256_setzero_ps();
205 fiy0 = _mm256_setzero_ps();
206 fiz0 = _mm256_setzero_ps();
207 fix1 = _mm256_setzero_ps();
208 fiy1 = _mm256_setzero_ps();
209 fiz1 = _mm256_setzero_ps();
210 fix2 = _mm256_setzero_ps();
211 fiy2 = _mm256_setzero_ps();
212 fiz2 = _mm256_setzero_ps();
214 /* Reset potential sums */
215 velecsum = _mm256_setzero_ps();
216 vvdwsum = _mm256_setzero_ps();
218 /* Start inner kernel loop */
219 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
222 /* Get j neighbor index, and coordinate index */
231 j_coord_offsetA = DIM*jnrA;
232 j_coord_offsetB = DIM*jnrB;
233 j_coord_offsetC = DIM*jnrC;
234 j_coord_offsetD = DIM*jnrD;
235 j_coord_offsetE = DIM*jnrE;
236 j_coord_offsetF = DIM*jnrF;
237 j_coord_offsetG = DIM*jnrG;
238 j_coord_offsetH = DIM*jnrH;
240 /* load j atom coordinates */
241 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
242 x+j_coord_offsetC,x+j_coord_offsetD,
243 x+j_coord_offsetE,x+j_coord_offsetF,
244 x+j_coord_offsetG,x+j_coord_offsetH,
245 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
247 /* Calculate displacement vector */
248 dx00 = _mm256_sub_ps(ix0,jx0);
249 dy00 = _mm256_sub_ps(iy0,jy0);
250 dz00 = _mm256_sub_ps(iz0,jz0);
251 dx01 = _mm256_sub_ps(ix0,jx1);
252 dy01 = _mm256_sub_ps(iy0,jy1);
253 dz01 = _mm256_sub_ps(iz0,jz1);
254 dx02 = _mm256_sub_ps(ix0,jx2);
255 dy02 = _mm256_sub_ps(iy0,jy2);
256 dz02 = _mm256_sub_ps(iz0,jz2);
257 dx10 = _mm256_sub_ps(ix1,jx0);
258 dy10 = _mm256_sub_ps(iy1,jy0);
259 dz10 = _mm256_sub_ps(iz1,jz0);
260 dx11 = _mm256_sub_ps(ix1,jx1);
261 dy11 = _mm256_sub_ps(iy1,jy1);
262 dz11 = _mm256_sub_ps(iz1,jz1);
263 dx12 = _mm256_sub_ps(ix1,jx2);
264 dy12 = _mm256_sub_ps(iy1,jy2);
265 dz12 = _mm256_sub_ps(iz1,jz2);
266 dx20 = _mm256_sub_ps(ix2,jx0);
267 dy20 = _mm256_sub_ps(iy2,jy0);
268 dz20 = _mm256_sub_ps(iz2,jz0);
269 dx21 = _mm256_sub_ps(ix2,jx1);
270 dy21 = _mm256_sub_ps(iy2,jy1);
271 dz21 = _mm256_sub_ps(iz2,jz1);
272 dx22 = _mm256_sub_ps(ix2,jx2);
273 dy22 = _mm256_sub_ps(iy2,jy2);
274 dz22 = _mm256_sub_ps(iz2,jz2);
276 /* Calculate squared distance and things based on it */
277 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
278 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
279 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
280 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
281 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
282 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
283 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
284 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
285 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
287 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
288 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
289 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
290 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
291 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
292 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
293 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
294 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
295 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
297 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
299 fjx0 = _mm256_setzero_ps();
300 fjy0 = _mm256_setzero_ps();
301 fjz0 = _mm256_setzero_ps();
302 fjx1 = _mm256_setzero_ps();
303 fjy1 = _mm256_setzero_ps();
304 fjz1 = _mm256_setzero_ps();
305 fjx2 = _mm256_setzero_ps();
306 fjy2 = _mm256_setzero_ps();
307 fjz2 = _mm256_setzero_ps();
309 /**************************
310 * CALCULATE INTERACTIONS *
311 **************************/
313 r00 = _mm256_mul_ps(rsq00,rinv00);
315 /* Calculate table index by multiplying r with table scale and truncate to integer */
316 rt = _mm256_mul_ps(r00,vftabscale);
317 vfitab = _mm256_cvttps_epi32(rt);
318 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
319 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
320 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
321 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
322 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
323 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
325 /* CUBIC SPLINE TABLE ELECTROSTATICS */
326 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
327 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
328 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
329 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
330 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
331 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
332 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
333 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
334 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
335 Heps = _mm256_mul_ps(vfeps,H);
336 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
337 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
338 velec = _mm256_mul_ps(qq00,VV);
339 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
340 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
342 /* LENNARD-JONES DISPERSION/REPULSION */
344 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
345 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
346 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
347 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
348 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
350 /* Update potential sum for this i atom from the interaction with this j atom. */
351 velecsum = _mm256_add_ps(velecsum,velec);
352 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
354 fscal = _mm256_add_ps(felec,fvdw);
356 /* Calculate temporary vectorial force */
357 tx = _mm256_mul_ps(fscal,dx00);
358 ty = _mm256_mul_ps(fscal,dy00);
359 tz = _mm256_mul_ps(fscal,dz00);
361 /* Update vectorial force */
362 fix0 = _mm256_add_ps(fix0,tx);
363 fiy0 = _mm256_add_ps(fiy0,ty);
364 fiz0 = _mm256_add_ps(fiz0,tz);
366 fjx0 = _mm256_add_ps(fjx0,tx);
367 fjy0 = _mm256_add_ps(fjy0,ty);
368 fjz0 = _mm256_add_ps(fjz0,tz);
370 /**************************
371 * CALCULATE INTERACTIONS *
372 **************************/
374 r01 = _mm256_mul_ps(rsq01,rinv01);
376 /* Calculate table index by multiplying r with table scale and truncate to integer */
377 rt = _mm256_mul_ps(r01,vftabscale);
378 vfitab = _mm256_cvttps_epi32(rt);
379 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
380 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
381 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
382 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
383 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
384 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
386 /* CUBIC SPLINE TABLE ELECTROSTATICS */
387 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
388 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
389 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
390 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
391 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
392 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
393 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
394 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
395 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
396 Heps = _mm256_mul_ps(vfeps,H);
397 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
398 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
399 velec = _mm256_mul_ps(qq01,VV);
400 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
401 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
403 /* Update potential sum for this i atom from the interaction with this j atom. */
404 velecsum = _mm256_add_ps(velecsum,velec);
408 /* Calculate temporary vectorial force */
409 tx = _mm256_mul_ps(fscal,dx01);
410 ty = _mm256_mul_ps(fscal,dy01);
411 tz = _mm256_mul_ps(fscal,dz01);
413 /* Update vectorial force */
414 fix0 = _mm256_add_ps(fix0,tx);
415 fiy0 = _mm256_add_ps(fiy0,ty);
416 fiz0 = _mm256_add_ps(fiz0,tz);
418 fjx1 = _mm256_add_ps(fjx1,tx);
419 fjy1 = _mm256_add_ps(fjy1,ty);
420 fjz1 = _mm256_add_ps(fjz1,tz);
422 /**************************
423 * CALCULATE INTERACTIONS *
424 **************************/
426 r02 = _mm256_mul_ps(rsq02,rinv02);
428 /* Calculate table index by multiplying r with table scale and truncate to integer */
429 rt = _mm256_mul_ps(r02,vftabscale);
430 vfitab = _mm256_cvttps_epi32(rt);
431 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
432 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
433 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
434 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
435 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
436 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
438 /* CUBIC SPLINE TABLE ELECTROSTATICS */
439 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
440 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
441 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
442 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
443 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
444 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
445 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
446 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
447 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
448 Heps = _mm256_mul_ps(vfeps,H);
449 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
450 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
451 velec = _mm256_mul_ps(qq02,VV);
452 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
453 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
455 /* Update potential sum for this i atom from the interaction with this j atom. */
456 velecsum = _mm256_add_ps(velecsum,velec);
460 /* Calculate temporary vectorial force */
461 tx = _mm256_mul_ps(fscal,dx02);
462 ty = _mm256_mul_ps(fscal,dy02);
463 tz = _mm256_mul_ps(fscal,dz02);
465 /* Update vectorial force */
466 fix0 = _mm256_add_ps(fix0,tx);
467 fiy0 = _mm256_add_ps(fiy0,ty);
468 fiz0 = _mm256_add_ps(fiz0,tz);
470 fjx2 = _mm256_add_ps(fjx2,tx);
471 fjy2 = _mm256_add_ps(fjy2,ty);
472 fjz2 = _mm256_add_ps(fjz2,tz);
474 /**************************
475 * CALCULATE INTERACTIONS *
476 **************************/
478 r10 = _mm256_mul_ps(rsq10,rinv10);
480 /* Calculate table index by multiplying r with table scale and truncate to integer */
481 rt = _mm256_mul_ps(r10,vftabscale);
482 vfitab = _mm256_cvttps_epi32(rt);
483 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
484 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
485 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
486 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
487 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
488 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
490 /* CUBIC SPLINE TABLE ELECTROSTATICS */
491 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
492 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
493 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
494 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
495 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
496 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
497 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
498 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
499 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
500 Heps = _mm256_mul_ps(vfeps,H);
501 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
502 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
503 velec = _mm256_mul_ps(qq10,VV);
504 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
505 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
507 /* Update potential sum for this i atom from the interaction with this j atom. */
508 velecsum = _mm256_add_ps(velecsum,velec);
512 /* Calculate temporary vectorial force */
513 tx = _mm256_mul_ps(fscal,dx10);
514 ty = _mm256_mul_ps(fscal,dy10);
515 tz = _mm256_mul_ps(fscal,dz10);
517 /* Update vectorial force */
518 fix1 = _mm256_add_ps(fix1,tx);
519 fiy1 = _mm256_add_ps(fiy1,ty);
520 fiz1 = _mm256_add_ps(fiz1,tz);
522 fjx0 = _mm256_add_ps(fjx0,tx);
523 fjy0 = _mm256_add_ps(fjy0,ty);
524 fjz0 = _mm256_add_ps(fjz0,tz);
526 /**************************
527 * CALCULATE INTERACTIONS *
528 **************************/
530 r11 = _mm256_mul_ps(rsq11,rinv11);
532 /* Calculate table index by multiplying r with table scale and truncate to integer */
533 rt = _mm256_mul_ps(r11,vftabscale);
534 vfitab = _mm256_cvttps_epi32(rt);
535 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
536 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
537 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
538 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
539 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
540 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
542 /* CUBIC SPLINE TABLE ELECTROSTATICS */
543 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
544 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
545 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
546 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
547 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
548 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
549 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
550 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
551 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
552 Heps = _mm256_mul_ps(vfeps,H);
553 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
554 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
555 velec = _mm256_mul_ps(qq11,VV);
556 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
557 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
559 /* Update potential sum for this i atom from the interaction with this j atom. */
560 velecsum = _mm256_add_ps(velecsum,velec);
564 /* Calculate temporary vectorial force */
565 tx = _mm256_mul_ps(fscal,dx11);
566 ty = _mm256_mul_ps(fscal,dy11);
567 tz = _mm256_mul_ps(fscal,dz11);
569 /* Update vectorial force */
570 fix1 = _mm256_add_ps(fix1,tx);
571 fiy1 = _mm256_add_ps(fiy1,ty);
572 fiz1 = _mm256_add_ps(fiz1,tz);
574 fjx1 = _mm256_add_ps(fjx1,tx);
575 fjy1 = _mm256_add_ps(fjy1,ty);
576 fjz1 = _mm256_add_ps(fjz1,tz);
578 /**************************
579 * CALCULATE INTERACTIONS *
580 **************************/
582 r12 = _mm256_mul_ps(rsq12,rinv12);
584 /* Calculate table index by multiplying r with table scale and truncate to integer */
585 rt = _mm256_mul_ps(r12,vftabscale);
586 vfitab = _mm256_cvttps_epi32(rt);
587 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
588 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
589 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
590 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
591 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
592 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
594 /* CUBIC SPLINE TABLE ELECTROSTATICS */
595 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
596 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
597 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
598 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
599 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
600 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
601 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
602 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
603 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
604 Heps = _mm256_mul_ps(vfeps,H);
605 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
606 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
607 velec = _mm256_mul_ps(qq12,VV);
608 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
609 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
611 /* Update potential sum for this i atom from the interaction with this j atom. */
612 velecsum = _mm256_add_ps(velecsum,velec);
616 /* Calculate temporary vectorial force */
617 tx = _mm256_mul_ps(fscal,dx12);
618 ty = _mm256_mul_ps(fscal,dy12);
619 tz = _mm256_mul_ps(fscal,dz12);
621 /* Update vectorial force */
622 fix1 = _mm256_add_ps(fix1,tx);
623 fiy1 = _mm256_add_ps(fiy1,ty);
624 fiz1 = _mm256_add_ps(fiz1,tz);
626 fjx2 = _mm256_add_ps(fjx2,tx);
627 fjy2 = _mm256_add_ps(fjy2,ty);
628 fjz2 = _mm256_add_ps(fjz2,tz);
630 /**************************
631 * CALCULATE INTERACTIONS *
632 **************************/
634 r20 = _mm256_mul_ps(rsq20,rinv20);
636 /* Calculate table index by multiplying r with table scale and truncate to integer */
637 rt = _mm256_mul_ps(r20,vftabscale);
638 vfitab = _mm256_cvttps_epi32(rt);
639 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
640 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
641 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
642 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
643 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
644 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
646 /* CUBIC SPLINE TABLE ELECTROSTATICS */
647 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
648 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
649 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
650 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
651 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
652 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
653 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
654 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
655 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
656 Heps = _mm256_mul_ps(vfeps,H);
657 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
658 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
659 velec = _mm256_mul_ps(qq20,VV);
660 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
661 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
663 /* Update potential sum for this i atom from the interaction with this j atom. */
664 velecsum = _mm256_add_ps(velecsum,velec);
668 /* Calculate temporary vectorial force */
669 tx = _mm256_mul_ps(fscal,dx20);
670 ty = _mm256_mul_ps(fscal,dy20);
671 tz = _mm256_mul_ps(fscal,dz20);
673 /* Update vectorial force */
674 fix2 = _mm256_add_ps(fix2,tx);
675 fiy2 = _mm256_add_ps(fiy2,ty);
676 fiz2 = _mm256_add_ps(fiz2,tz);
678 fjx0 = _mm256_add_ps(fjx0,tx);
679 fjy0 = _mm256_add_ps(fjy0,ty);
680 fjz0 = _mm256_add_ps(fjz0,tz);
682 /**************************
683 * CALCULATE INTERACTIONS *
684 **************************/
686 r21 = _mm256_mul_ps(rsq21,rinv21);
688 /* Calculate table index by multiplying r with table scale and truncate to integer */
689 rt = _mm256_mul_ps(r21,vftabscale);
690 vfitab = _mm256_cvttps_epi32(rt);
691 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
692 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
693 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
694 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
695 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
696 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
698 /* CUBIC SPLINE TABLE ELECTROSTATICS */
699 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
700 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
701 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
702 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
703 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
704 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
705 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
706 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
707 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
708 Heps = _mm256_mul_ps(vfeps,H);
709 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
710 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
711 velec = _mm256_mul_ps(qq21,VV);
712 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
713 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
715 /* Update potential sum for this i atom from the interaction with this j atom. */
716 velecsum = _mm256_add_ps(velecsum,velec);
720 /* Calculate temporary vectorial force */
721 tx = _mm256_mul_ps(fscal,dx21);
722 ty = _mm256_mul_ps(fscal,dy21);
723 tz = _mm256_mul_ps(fscal,dz21);
725 /* Update vectorial force */
726 fix2 = _mm256_add_ps(fix2,tx);
727 fiy2 = _mm256_add_ps(fiy2,ty);
728 fiz2 = _mm256_add_ps(fiz2,tz);
730 fjx1 = _mm256_add_ps(fjx1,tx);
731 fjy1 = _mm256_add_ps(fjy1,ty);
732 fjz1 = _mm256_add_ps(fjz1,tz);
734 /**************************
735 * CALCULATE INTERACTIONS *
736 **************************/
738 r22 = _mm256_mul_ps(rsq22,rinv22);
740 /* Calculate table index by multiplying r with table scale and truncate to integer */
741 rt = _mm256_mul_ps(r22,vftabscale);
742 vfitab = _mm256_cvttps_epi32(rt);
743 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
744 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
745 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
746 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
747 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
748 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
750 /* CUBIC SPLINE TABLE ELECTROSTATICS */
751 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
752 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
753 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
754 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
755 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
756 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
757 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
758 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
759 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
760 Heps = _mm256_mul_ps(vfeps,H);
761 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
762 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
763 velec = _mm256_mul_ps(qq22,VV);
764 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
765 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
767 /* Update potential sum for this i atom from the interaction with this j atom. */
768 velecsum = _mm256_add_ps(velecsum,velec);
772 /* Calculate temporary vectorial force */
773 tx = _mm256_mul_ps(fscal,dx22);
774 ty = _mm256_mul_ps(fscal,dy22);
775 tz = _mm256_mul_ps(fscal,dz22);
777 /* Update vectorial force */
778 fix2 = _mm256_add_ps(fix2,tx);
779 fiy2 = _mm256_add_ps(fiy2,ty);
780 fiz2 = _mm256_add_ps(fiz2,tz);
782 fjx2 = _mm256_add_ps(fjx2,tx);
783 fjy2 = _mm256_add_ps(fjy2,ty);
784 fjz2 = _mm256_add_ps(fjz2,tz);
786 fjptrA = f+j_coord_offsetA;
787 fjptrB = f+j_coord_offsetB;
788 fjptrC = f+j_coord_offsetC;
789 fjptrD = f+j_coord_offsetD;
790 fjptrE = f+j_coord_offsetE;
791 fjptrF = f+j_coord_offsetF;
792 fjptrG = f+j_coord_offsetG;
793 fjptrH = f+j_coord_offsetH;
795 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
796 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
798 /* Inner loop uses 400 flops */
804 /* Get j neighbor index, and coordinate index */
805 jnrlistA = jjnr[jidx];
806 jnrlistB = jjnr[jidx+1];
807 jnrlistC = jjnr[jidx+2];
808 jnrlistD = jjnr[jidx+3];
809 jnrlistE = jjnr[jidx+4];
810 jnrlistF = jjnr[jidx+5];
811 jnrlistG = jjnr[jidx+6];
812 jnrlistH = jjnr[jidx+7];
813 /* Sign of each element will be negative for non-real atoms.
814 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
815 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
817 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
818 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
820 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
821 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
822 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
823 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
824 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
825 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
826 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
827 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
828 j_coord_offsetA = DIM*jnrA;
829 j_coord_offsetB = DIM*jnrB;
830 j_coord_offsetC = DIM*jnrC;
831 j_coord_offsetD = DIM*jnrD;
832 j_coord_offsetE = DIM*jnrE;
833 j_coord_offsetF = DIM*jnrF;
834 j_coord_offsetG = DIM*jnrG;
835 j_coord_offsetH = DIM*jnrH;
837 /* load j atom coordinates */
838 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
839 x+j_coord_offsetC,x+j_coord_offsetD,
840 x+j_coord_offsetE,x+j_coord_offsetF,
841 x+j_coord_offsetG,x+j_coord_offsetH,
842 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
844 /* Calculate displacement vector */
845 dx00 = _mm256_sub_ps(ix0,jx0);
846 dy00 = _mm256_sub_ps(iy0,jy0);
847 dz00 = _mm256_sub_ps(iz0,jz0);
848 dx01 = _mm256_sub_ps(ix0,jx1);
849 dy01 = _mm256_sub_ps(iy0,jy1);
850 dz01 = _mm256_sub_ps(iz0,jz1);
851 dx02 = _mm256_sub_ps(ix0,jx2);
852 dy02 = _mm256_sub_ps(iy0,jy2);
853 dz02 = _mm256_sub_ps(iz0,jz2);
854 dx10 = _mm256_sub_ps(ix1,jx0);
855 dy10 = _mm256_sub_ps(iy1,jy0);
856 dz10 = _mm256_sub_ps(iz1,jz0);
857 dx11 = _mm256_sub_ps(ix1,jx1);
858 dy11 = _mm256_sub_ps(iy1,jy1);
859 dz11 = _mm256_sub_ps(iz1,jz1);
860 dx12 = _mm256_sub_ps(ix1,jx2);
861 dy12 = _mm256_sub_ps(iy1,jy2);
862 dz12 = _mm256_sub_ps(iz1,jz2);
863 dx20 = _mm256_sub_ps(ix2,jx0);
864 dy20 = _mm256_sub_ps(iy2,jy0);
865 dz20 = _mm256_sub_ps(iz2,jz0);
866 dx21 = _mm256_sub_ps(ix2,jx1);
867 dy21 = _mm256_sub_ps(iy2,jy1);
868 dz21 = _mm256_sub_ps(iz2,jz1);
869 dx22 = _mm256_sub_ps(ix2,jx2);
870 dy22 = _mm256_sub_ps(iy2,jy2);
871 dz22 = _mm256_sub_ps(iz2,jz2);
873 /* Calculate squared distance and things based on it */
874 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
875 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
876 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
877 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
878 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
879 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
880 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
881 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
882 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
884 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
885 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
886 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
887 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
888 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
889 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
890 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
891 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
892 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
894 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
896 fjx0 = _mm256_setzero_ps();
897 fjy0 = _mm256_setzero_ps();
898 fjz0 = _mm256_setzero_ps();
899 fjx1 = _mm256_setzero_ps();
900 fjy1 = _mm256_setzero_ps();
901 fjz1 = _mm256_setzero_ps();
902 fjx2 = _mm256_setzero_ps();
903 fjy2 = _mm256_setzero_ps();
904 fjz2 = _mm256_setzero_ps();
906 /**************************
907 * CALCULATE INTERACTIONS *
908 **************************/
910 r00 = _mm256_mul_ps(rsq00,rinv00);
911 r00 = _mm256_andnot_ps(dummy_mask,r00);
913 /* Calculate table index by multiplying r with table scale and truncate to integer */
914 rt = _mm256_mul_ps(r00,vftabscale);
915 vfitab = _mm256_cvttps_epi32(rt);
916 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
917 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
918 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
919 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
920 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
921 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
923 /* CUBIC SPLINE TABLE ELECTROSTATICS */
924 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
925 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
926 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
927 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
928 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
929 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
930 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
931 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
932 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
933 Heps = _mm256_mul_ps(vfeps,H);
934 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
935 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
936 velec = _mm256_mul_ps(qq00,VV);
937 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
938 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
940 /* LENNARD-JONES DISPERSION/REPULSION */
942 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
943 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
944 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
945 vvdw = _mm256_sub_ps( _mm256_mul_ps(vvdw12,one_twelfth) , _mm256_mul_ps(vvdw6,one_sixth) );
946 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
948 /* Update potential sum for this i atom from the interaction with this j atom. */
949 velec = _mm256_andnot_ps(dummy_mask,velec);
950 velecsum = _mm256_add_ps(velecsum,velec);
951 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
952 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
954 fscal = _mm256_add_ps(felec,fvdw);
956 fscal = _mm256_andnot_ps(dummy_mask,fscal);
958 /* Calculate temporary vectorial force */
959 tx = _mm256_mul_ps(fscal,dx00);
960 ty = _mm256_mul_ps(fscal,dy00);
961 tz = _mm256_mul_ps(fscal,dz00);
963 /* Update vectorial force */
964 fix0 = _mm256_add_ps(fix0,tx);
965 fiy0 = _mm256_add_ps(fiy0,ty);
966 fiz0 = _mm256_add_ps(fiz0,tz);
968 fjx0 = _mm256_add_ps(fjx0,tx);
969 fjy0 = _mm256_add_ps(fjy0,ty);
970 fjz0 = _mm256_add_ps(fjz0,tz);
972 /**************************
973 * CALCULATE INTERACTIONS *
974 **************************/
976 r01 = _mm256_mul_ps(rsq01,rinv01);
977 r01 = _mm256_andnot_ps(dummy_mask,r01);
979 /* Calculate table index by multiplying r with table scale and truncate to integer */
980 rt = _mm256_mul_ps(r01,vftabscale);
981 vfitab = _mm256_cvttps_epi32(rt);
982 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
983 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
984 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
985 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
986 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
987 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
989 /* CUBIC SPLINE TABLE ELECTROSTATICS */
990 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
991 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
992 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
993 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
994 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
995 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
996 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
997 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
998 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
999 Heps = _mm256_mul_ps(vfeps,H);
1000 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1001 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1002 velec = _mm256_mul_ps(qq01,VV);
1003 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1004 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1006 /* Update potential sum for this i atom from the interaction with this j atom. */
1007 velec = _mm256_andnot_ps(dummy_mask,velec);
1008 velecsum = _mm256_add_ps(velecsum,velec);
1012 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1014 /* Calculate temporary vectorial force */
1015 tx = _mm256_mul_ps(fscal,dx01);
1016 ty = _mm256_mul_ps(fscal,dy01);
1017 tz = _mm256_mul_ps(fscal,dz01);
1019 /* Update vectorial force */
1020 fix0 = _mm256_add_ps(fix0,tx);
1021 fiy0 = _mm256_add_ps(fiy0,ty);
1022 fiz0 = _mm256_add_ps(fiz0,tz);
1024 fjx1 = _mm256_add_ps(fjx1,tx);
1025 fjy1 = _mm256_add_ps(fjy1,ty);
1026 fjz1 = _mm256_add_ps(fjz1,tz);
1028 /**************************
1029 * CALCULATE INTERACTIONS *
1030 **************************/
1032 r02 = _mm256_mul_ps(rsq02,rinv02);
1033 r02 = _mm256_andnot_ps(dummy_mask,r02);
1035 /* Calculate table index by multiplying r with table scale and truncate to integer */
1036 rt = _mm256_mul_ps(r02,vftabscale);
1037 vfitab = _mm256_cvttps_epi32(rt);
1038 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1039 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1040 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1041 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1042 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1043 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1045 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1046 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1047 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1048 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1049 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1050 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1051 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1052 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1053 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1054 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1055 Heps = _mm256_mul_ps(vfeps,H);
1056 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1057 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1058 velec = _mm256_mul_ps(qq02,VV);
1059 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1060 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1062 /* Update potential sum for this i atom from the interaction with this j atom. */
1063 velec = _mm256_andnot_ps(dummy_mask,velec);
1064 velecsum = _mm256_add_ps(velecsum,velec);
1068 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1070 /* Calculate temporary vectorial force */
1071 tx = _mm256_mul_ps(fscal,dx02);
1072 ty = _mm256_mul_ps(fscal,dy02);
1073 tz = _mm256_mul_ps(fscal,dz02);
1075 /* Update vectorial force */
1076 fix0 = _mm256_add_ps(fix0,tx);
1077 fiy0 = _mm256_add_ps(fiy0,ty);
1078 fiz0 = _mm256_add_ps(fiz0,tz);
1080 fjx2 = _mm256_add_ps(fjx2,tx);
1081 fjy2 = _mm256_add_ps(fjy2,ty);
1082 fjz2 = _mm256_add_ps(fjz2,tz);
1084 /**************************
1085 * CALCULATE INTERACTIONS *
1086 **************************/
1088 r10 = _mm256_mul_ps(rsq10,rinv10);
1089 r10 = _mm256_andnot_ps(dummy_mask,r10);
1091 /* Calculate table index by multiplying r with table scale and truncate to integer */
1092 rt = _mm256_mul_ps(r10,vftabscale);
1093 vfitab = _mm256_cvttps_epi32(rt);
1094 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1095 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1096 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1097 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1098 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1099 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1101 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1102 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1103 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1104 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1105 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1106 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1107 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1108 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1109 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1110 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1111 Heps = _mm256_mul_ps(vfeps,H);
1112 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1113 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1114 velec = _mm256_mul_ps(qq10,VV);
1115 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1116 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1118 /* Update potential sum for this i atom from the interaction with this j atom. */
1119 velec = _mm256_andnot_ps(dummy_mask,velec);
1120 velecsum = _mm256_add_ps(velecsum,velec);
1124 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1126 /* Calculate temporary vectorial force */
1127 tx = _mm256_mul_ps(fscal,dx10);
1128 ty = _mm256_mul_ps(fscal,dy10);
1129 tz = _mm256_mul_ps(fscal,dz10);
1131 /* Update vectorial force */
1132 fix1 = _mm256_add_ps(fix1,tx);
1133 fiy1 = _mm256_add_ps(fiy1,ty);
1134 fiz1 = _mm256_add_ps(fiz1,tz);
1136 fjx0 = _mm256_add_ps(fjx0,tx);
1137 fjy0 = _mm256_add_ps(fjy0,ty);
1138 fjz0 = _mm256_add_ps(fjz0,tz);
1140 /**************************
1141 * CALCULATE INTERACTIONS *
1142 **************************/
1144 r11 = _mm256_mul_ps(rsq11,rinv11);
1145 r11 = _mm256_andnot_ps(dummy_mask,r11);
1147 /* Calculate table index by multiplying r with table scale and truncate to integer */
1148 rt = _mm256_mul_ps(r11,vftabscale);
1149 vfitab = _mm256_cvttps_epi32(rt);
1150 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1151 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1152 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1153 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1154 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1155 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1157 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1158 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1159 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1160 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1161 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1162 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1163 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1164 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1165 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1166 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1167 Heps = _mm256_mul_ps(vfeps,H);
1168 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1169 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1170 velec = _mm256_mul_ps(qq11,VV);
1171 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1172 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1174 /* Update potential sum for this i atom from the interaction with this j atom. */
1175 velec = _mm256_andnot_ps(dummy_mask,velec);
1176 velecsum = _mm256_add_ps(velecsum,velec);
1180 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1182 /* Calculate temporary vectorial force */
1183 tx = _mm256_mul_ps(fscal,dx11);
1184 ty = _mm256_mul_ps(fscal,dy11);
1185 tz = _mm256_mul_ps(fscal,dz11);
1187 /* Update vectorial force */
1188 fix1 = _mm256_add_ps(fix1,tx);
1189 fiy1 = _mm256_add_ps(fiy1,ty);
1190 fiz1 = _mm256_add_ps(fiz1,tz);
1192 fjx1 = _mm256_add_ps(fjx1,tx);
1193 fjy1 = _mm256_add_ps(fjy1,ty);
1194 fjz1 = _mm256_add_ps(fjz1,tz);
1196 /**************************
1197 * CALCULATE INTERACTIONS *
1198 **************************/
1200 r12 = _mm256_mul_ps(rsq12,rinv12);
1201 r12 = _mm256_andnot_ps(dummy_mask,r12);
1203 /* Calculate table index by multiplying r with table scale and truncate to integer */
1204 rt = _mm256_mul_ps(r12,vftabscale);
1205 vfitab = _mm256_cvttps_epi32(rt);
1206 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1207 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1208 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1209 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1210 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1211 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1213 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1214 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1215 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1216 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1217 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1218 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1219 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1220 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1221 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1222 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1223 Heps = _mm256_mul_ps(vfeps,H);
1224 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1225 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1226 velec = _mm256_mul_ps(qq12,VV);
1227 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1228 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1230 /* Update potential sum for this i atom from the interaction with this j atom. */
1231 velec = _mm256_andnot_ps(dummy_mask,velec);
1232 velecsum = _mm256_add_ps(velecsum,velec);
1236 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1238 /* Calculate temporary vectorial force */
1239 tx = _mm256_mul_ps(fscal,dx12);
1240 ty = _mm256_mul_ps(fscal,dy12);
1241 tz = _mm256_mul_ps(fscal,dz12);
1243 /* Update vectorial force */
1244 fix1 = _mm256_add_ps(fix1,tx);
1245 fiy1 = _mm256_add_ps(fiy1,ty);
1246 fiz1 = _mm256_add_ps(fiz1,tz);
1248 fjx2 = _mm256_add_ps(fjx2,tx);
1249 fjy2 = _mm256_add_ps(fjy2,ty);
1250 fjz2 = _mm256_add_ps(fjz2,tz);
1252 /**************************
1253 * CALCULATE INTERACTIONS *
1254 **************************/
1256 r20 = _mm256_mul_ps(rsq20,rinv20);
1257 r20 = _mm256_andnot_ps(dummy_mask,r20);
1259 /* Calculate table index by multiplying r with table scale and truncate to integer */
1260 rt = _mm256_mul_ps(r20,vftabscale);
1261 vfitab = _mm256_cvttps_epi32(rt);
1262 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1263 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1264 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1265 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1266 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1267 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1269 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1270 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1271 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1272 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1273 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1274 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1275 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1276 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1277 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1278 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1279 Heps = _mm256_mul_ps(vfeps,H);
1280 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1281 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1282 velec = _mm256_mul_ps(qq20,VV);
1283 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1284 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
1286 /* Update potential sum for this i atom from the interaction with this j atom. */
1287 velec = _mm256_andnot_ps(dummy_mask,velec);
1288 velecsum = _mm256_add_ps(velecsum,velec);
1292 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1294 /* Calculate temporary vectorial force */
1295 tx = _mm256_mul_ps(fscal,dx20);
1296 ty = _mm256_mul_ps(fscal,dy20);
1297 tz = _mm256_mul_ps(fscal,dz20);
1299 /* Update vectorial force */
1300 fix2 = _mm256_add_ps(fix2,tx);
1301 fiy2 = _mm256_add_ps(fiy2,ty);
1302 fiz2 = _mm256_add_ps(fiz2,tz);
1304 fjx0 = _mm256_add_ps(fjx0,tx);
1305 fjy0 = _mm256_add_ps(fjy0,ty);
1306 fjz0 = _mm256_add_ps(fjz0,tz);
1308 /**************************
1309 * CALCULATE INTERACTIONS *
1310 **************************/
1312 r21 = _mm256_mul_ps(rsq21,rinv21);
1313 r21 = _mm256_andnot_ps(dummy_mask,r21);
1315 /* Calculate table index by multiplying r with table scale and truncate to integer */
1316 rt = _mm256_mul_ps(r21,vftabscale);
1317 vfitab = _mm256_cvttps_epi32(rt);
1318 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1319 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1320 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1321 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1322 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1323 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1325 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1326 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1327 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1328 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1329 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1330 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1331 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1332 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1333 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1334 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1335 Heps = _mm256_mul_ps(vfeps,H);
1336 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1337 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1338 velec = _mm256_mul_ps(qq21,VV);
1339 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1340 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
1342 /* Update potential sum for this i atom from the interaction with this j atom. */
1343 velec = _mm256_andnot_ps(dummy_mask,velec);
1344 velecsum = _mm256_add_ps(velecsum,velec);
1348 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1350 /* Calculate temporary vectorial force */
1351 tx = _mm256_mul_ps(fscal,dx21);
1352 ty = _mm256_mul_ps(fscal,dy21);
1353 tz = _mm256_mul_ps(fscal,dz21);
1355 /* Update vectorial force */
1356 fix2 = _mm256_add_ps(fix2,tx);
1357 fiy2 = _mm256_add_ps(fiy2,ty);
1358 fiz2 = _mm256_add_ps(fiz2,tz);
1360 fjx1 = _mm256_add_ps(fjx1,tx);
1361 fjy1 = _mm256_add_ps(fjy1,ty);
1362 fjz1 = _mm256_add_ps(fjz1,tz);
1364 /**************************
1365 * CALCULATE INTERACTIONS *
1366 **************************/
1368 r22 = _mm256_mul_ps(rsq22,rinv22);
1369 r22 = _mm256_andnot_ps(dummy_mask,r22);
1371 /* Calculate table index by multiplying r with table scale and truncate to integer */
1372 rt = _mm256_mul_ps(r22,vftabscale);
1373 vfitab = _mm256_cvttps_epi32(rt);
1374 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1375 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1376 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1377 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1378 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1379 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1381 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1382 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1383 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1384 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1385 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1386 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1387 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1388 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1389 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1390 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1391 Heps = _mm256_mul_ps(vfeps,H);
1392 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1393 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
1394 velec = _mm256_mul_ps(qq22,VV);
1395 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1396 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
1398 /* Update potential sum for this i atom from the interaction with this j atom. */
1399 velec = _mm256_andnot_ps(dummy_mask,velec);
1400 velecsum = _mm256_add_ps(velecsum,velec);
1404 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1406 /* Calculate temporary vectorial force */
1407 tx = _mm256_mul_ps(fscal,dx22);
1408 ty = _mm256_mul_ps(fscal,dy22);
1409 tz = _mm256_mul_ps(fscal,dz22);
1411 /* Update vectorial force */
1412 fix2 = _mm256_add_ps(fix2,tx);
1413 fiy2 = _mm256_add_ps(fiy2,ty);
1414 fiz2 = _mm256_add_ps(fiz2,tz);
1416 fjx2 = _mm256_add_ps(fjx2,tx);
1417 fjy2 = _mm256_add_ps(fjy2,ty);
1418 fjz2 = _mm256_add_ps(fjz2,tz);
1420 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1421 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1422 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1423 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1424 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1425 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1426 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1427 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1429 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1430 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1432 /* Inner loop uses 409 flops */
1435 /* End of innermost loop */
1437 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1438 f+i_coord_offset,fshift+i_shift_offset);
1441 /* Update potential energies */
1442 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1443 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1445 /* Increment number of inner iterations */
1446 inneriter += j_index_end - j_index_start;
1448 /* Outer loop uses 20 flops */
1451 /* Increment number of outer iterations */
1454 /* Update outer/inner flops */
1456 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*409);
1459 * Gromacs nonbonded kernel: nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single
1460 * Electrostatics interaction: CubicSplineTable
1461 * VdW interaction: LennardJones
1462 * Geometry: Water3-Water3
1463 * Calculate force/pot: Force
1466 nb_kernel_ElecCSTab_VdwLJ_GeomW3W3_F_avx_256_single
1467 (t_nblist * gmx_restrict nlist,
1468 rvec * gmx_restrict xx,
1469 rvec * gmx_restrict ff,
1470 t_forcerec * gmx_restrict fr,
1471 t_mdatoms * gmx_restrict mdatoms,
1472 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1473 t_nrnb * gmx_restrict nrnb)
1475 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1476 * just 0 for non-waters.
1477 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1478 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1480 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1481 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1482 int jnrA,jnrB,jnrC,jnrD;
1483 int jnrE,jnrF,jnrG,jnrH;
1484 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1485 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1486 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1487 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1488 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1489 real rcutoff_scalar;
1490 real *shiftvec,*fshift,*x,*f;
1491 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1492 real scratch[4*DIM];
1493 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1494 real * vdwioffsetptr0;
1495 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1496 real * vdwioffsetptr1;
1497 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1498 real * vdwioffsetptr2;
1499 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1500 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1501 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1502 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1503 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1504 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1505 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1506 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1507 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1508 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1509 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1510 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1511 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1512 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1513 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1514 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1515 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1518 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1521 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1522 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1524 __m128i vfitab_lo,vfitab_hi;
1525 __m128i ifour = _mm_set1_epi32(4);
1526 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1528 __m256 dummy_mask,cutoff_mask;
1529 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1530 __m256 one = _mm256_set1_ps(1.0);
1531 __m256 two = _mm256_set1_ps(2.0);
1537 jindex = nlist->jindex;
1539 shiftidx = nlist->shift;
1541 shiftvec = fr->shift_vec[0];
1542 fshift = fr->fshift[0];
1543 facel = _mm256_set1_ps(fr->epsfac);
1544 charge = mdatoms->chargeA;
1545 nvdwtype = fr->ntype;
1546 vdwparam = fr->nbfp;
1547 vdwtype = mdatoms->typeA;
1549 vftab = kernel_data->table_elec->data;
1550 vftabscale = _mm256_set1_ps(kernel_data->table_elec->scale);
1552 /* Setup water-specific parameters */
1553 inr = nlist->iinr[0];
1554 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1555 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1556 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1557 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1559 jq0 = _mm256_set1_ps(charge[inr+0]);
1560 jq1 = _mm256_set1_ps(charge[inr+1]);
1561 jq2 = _mm256_set1_ps(charge[inr+2]);
1562 vdwjidx0A = 2*vdwtype[inr+0];
1563 qq00 = _mm256_mul_ps(iq0,jq0);
1564 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1565 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1566 qq01 = _mm256_mul_ps(iq0,jq1);
1567 qq02 = _mm256_mul_ps(iq0,jq2);
1568 qq10 = _mm256_mul_ps(iq1,jq0);
1569 qq11 = _mm256_mul_ps(iq1,jq1);
1570 qq12 = _mm256_mul_ps(iq1,jq2);
1571 qq20 = _mm256_mul_ps(iq2,jq0);
1572 qq21 = _mm256_mul_ps(iq2,jq1);
1573 qq22 = _mm256_mul_ps(iq2,jq2);
1575 /* Avoid stupid compiler warnings */
1576 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1577 j_coord_offsetA = 0;
1578 j_coord_offsetB = 0;
1579 j_coord_offsetC = 0;
1580 j_coord_offsetD = 0;
1581 j_coord_offsetE = 0;
1582 j_coord_offsetF = 0;
1583 j_coord_offsetG = 0;
1584 j_coord_offsetH = 0;
1589 for(iidx=0;iidx<4*DIM;iidx++)
1591 scratch[iidx] = 0.0;
1594 /* Start outer loop over neighborlists */
1595 for(iidx=0; iidx<nri; iidx++)
1597 /* Load shift vector for this list */
1598 i_shift_offset = DIM*shiftidx[iidx];
1600 /* Load limits for loop over neighbors */
1601 j_index_start = jindex[iidx];
1602 j_index_end = jindex[iidx+1];
1604 /* Get outer coordinate index */
1606 i_coord_offset = DIM*inr;
1608 /* Load i particle coords and add shift vector */
1609 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1610 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1612 fix0 = _mm256_setzero_ps();
1613 fiy0 = _mm256_setzero_ps();
1614 fiz0 = _mm256_setzero_ps();
1615 fix1 = _mm256_setzero_ps();
1616 fiy1 = _mm256_setzero_ps();
1617 fiz1 = _mm256_setzero_ps();
1618 fix2 = _mm256_setzero_ps();
1619 fiy2 = _mm256_setzero_ps();
1620 fiz2 = _mm256_setzero_ps();
1622 /* Start inner kernel loop */
1623 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1626 /* Get j neighbor index, and coordinate index */
1628 jnrB = jjnr[jidx+1];
1629 jnrC = jjnr[jidx+2];
1630 jnrD = jjnr[jidx+3];
1631 jnrE = jjnr[jidx+4];
1632 jnrF = jjnr[jidx+5];
1633 jnrG = jjnr[jidx+6];
1634 jnrH = jjnr[jidx+7];
1635 j_coord_offsetA = DIM*jnrA;
1636 j_coord_offsetB = DIM*jnrB;
1637 j_coord_offsetC = DIM*jnrC;
1638 j_coord_offsetD = DIM*jnrD;
1639 j_coord_offsetE = DIM*jnrE;
1640 j_coord_offsetF = DIM*jnrF;
1641 j_coord_offsetG = DIM*jnrG;
1642 j_coord_offsetH = DIM*jnrH;
1644 /* load j atom coordinates */
1645 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1646 x+j_coord_offsetC,x+j_coord_offsetD,
1647 x+j_coord_offsetE,x+j_coord_offsetF,
1648 x+j_coord_offsetG,x+j_coord_offsetH,
1649 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1651 /* Calculate displacement vector */
1652 dx00 = _mm256_sub_ps(ix0,jx0);
1653 dy00 = _mm256_sub_ps(iy0,jy0);
1654 dz00 = _mm256_sub_ps(iz0,jz0);
1655 dx01 = _mm256_sub_ps(ix0,jx1);
1656 dy01 = _mm256_sub_ps(iy0,jy1);
1657 dz01 = _mm256_sub_ps(iz0,jz1);
1658 dx02 = _mm256_sub_ps(ix0,jx2);
1659 dy02 = _mm256_sub_ps(iy0,jy2);
1660 dz02 = _mm256_sub_ps(iz0,jz2);
1661 dx10 = _mm256_sub_ps(ix1,jx0);
1662 dy10 = _mm256_sub_ps(iy1,jy0);
1663 dz10 = _mm256_sub_ps(iz1,jz0);
1664 dx11 = _mm256_sub_ps(ix1,jx1);
1665 dy11 = _mm256_sub_ps(iy1,jy1);
1666 dz11 = _mm256_sub_ps(iz1,jz1);
1667 dx12 = _mm256_sub_ps(ix1,jx2);
1668 dy12 = _mm256_sub_ps(iy1,jy2);
1669 dz12 = _mm256_sub_ps(iz1,jz2);
1670 dx20 = _mm256_sub_ps(ix2,jx0);
1671 dy20 = _mm256_sub_ps(iy2,jy0);
1672 dz20 = _mm256_sub_ps(iz2,jz0);
1673 dx21 = _mm256_sub_ps(ix2,jx1);
1674 dy21 = _mm256_sub_ps(iy2,jy1);
1675 dz21 = _mm256_sub_ps(iz2,jz1);
1676 dx22 = _mm256_sub_ps(ix2,jx2);
1677 dy22 = _mm256_sub_ps(iy2,jy2);
1678 dz22 = _mm256_sub_ps(iz2,jz2);
1680 /* Calculate squared distance and things based on it */
1681 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1682 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1683 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1684 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1685 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1686 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1687 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1688 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1689 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1691 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1692 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1693 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1694 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1695 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1696 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1697 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1698 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1699 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1701 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1703 fjx0 = _mm256_setzero_ps();
1704 fjy0 = _mm256_setzero_ps();
1705 fjz0 = _mm256_setzero_ps();
1706 fjx1 = _mm256_setzero_ps();
1707 fjy1 = _mm256_setzero_ps();
1708 fjz1 = _mm256_setzero_ps();
1709 fjx2 = _mm256_setzero_ps();
1710 fjy2 = _mm256_setzero_ps();
1711 fjz2 = _mm256_setzero_ps();
1713 /**************************
1714 * CALCULATE INTERACTIONS *
1715 **************************/
1717 r00 = _mm256_mul_ps(rsq00,rinv00);
1719 /* Calculate table index by multiplying r with table scale and truncate to integer */
1720 rt = _mm256_mul_ps(r00,vftabscale);
1721 vfitab = _mm256_cvttps_epi32(rt);
1722 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1723 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1724 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1725 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1726 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1727 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1729 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1730 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1731 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1732 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1733 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1734 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1735 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1736 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1737 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1738 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1739 Heps = _mm256_mul_ps(vfeps,H);
1740 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1741 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1742 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
1744 /* LENNARD-JONES DISPERSION/REPULSION */
1746 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1747 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1749 fscal = _mm256_add_ps(felec,fvdw);
1751 /* Calculate temporary vectorial force */
1752 tx = _mm256_mul_ps(fscal,dx00);
1753 ty = _mm256_mul_ps(fscal,dy00);
1754 tz = _mm256_mul_ps(fscal,dz00);
1756 /* Update vectorial force */
1757 fix0 = _mm256_add_ps(fix0,tx);
1758 fiy0 = _mm256_add_ps(fiy0,ty);
1759 fiz0 = _mm256_add_ps(fiz0,tz);
1761 fjx0 = _mm256_add_ps(fjx0,tx);
1762 fjy0 = _mm256_add_ps(fjy0,ty);
1763 fjz0 = _mm256_add_ps(fjz0,tz);
1765 /**************************
1766 * CALCULATE INTERACTIONS *
1767 **************************/
1769 r01 = _mm256_mul_ps(rsq01,rinv01);
1771 /* Calculate table index by multiplying r with table scale and truncate to integer */
1772 rt = _mm256_mul_ps(r01,vftabscale);
1773 vfitab = _mm256_cvttps_epi32(rt);
1774 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1775 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1776 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1777 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1778 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1779 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1781 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1782 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1783 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1784 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1785 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1786 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1787 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1788 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1789 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1790 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1791 Heps = _mm256_mul_ps(vfeps,H);
1792 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1793 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1794 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
1798 /* Calculate temporary vectorial force */
1799 tx = _mm256_mul_ps(fscal,dx01);
1800 ty = _mm256_mul_ps(fscal,dy01);
1801 tz = _mm256_mul_ps(fscal,dz01);
1803 /* Update vectorial force */
1804 fix0 = _mm256_add_ps(fix0,tx);
1805 fiy0 = _mm256_add_ps(fiy0,ty);
1806 fiz0 = _mm256_add_ps(fiz0,tz);
1808 fjx1 = _mm256_add_ps(fjx1,tx);
1809 fjy1 = _mm256_add_ps(fjy1,ty);
1810 fjz1 = _mm256_add_ps(fjz1,tz);
1812 /**************************
1813 * CALCULATE INTERACTIONS *
1814 **************************/
1816 r02 = _mm256_mul_ps(rsq02,rinv02);
1818 /* Calculate table index by multiplying r with table scale and truncate to integer */
1819 rt = _mm256_mul_ps(r02,vftabscale);
1820 vfitab = _mm256_cvttps_epi32(rt);
1821 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1822 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1823 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1824 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1825 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1826 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1828 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1829 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1830 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1831 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1832 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1833 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1834 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1835 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1836 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1837 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1838 Heps = _mm256_mul_ps(vfeps,H);
1839 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1840 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1841 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
1845 /* Calculate temporary vectorial force */
1846 tx = _mm256_mul_ps(fscal,dx02);
1847 ty = _mm256_mul_ps(fscal,dy02);
1848 tz = _mm256_mul_ps(fscal,dz02);
1850 /* Update vectorial force */
1851 fix0 = _mm256_add_ps(fix0,tx);
1852 fiy0 = _mm256_add_ps(fiy0,ty);
1853 fiz0 = _mm256_add_ps(fiz0,tz);
1855 fjx2 = _mm256_add_ps(fjx2,tx);
1856 fjy2 = _mm256_add_ps(fjy2,ty);
1857 fjz2 = _mm256_add_ps(fjz2,tz);
1859 /**************************
1860 * CALCULATE INTERACTIONS *
1861 **************************/
1863 r10 = _mm256_mul_ps(rsq10,rinv10);
1865 /* Calculate table index by multiplying r with table scale and truncate to integer */
1866 rt = _mm256_mul_ps(r10,vftabscale);
1867 vfitab = _mm256_cvttps_epi32(rt);
1868 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1869 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1870 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1871 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1872 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1873 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1875 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1876 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1877 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1878 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1879 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1880 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1881 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1882 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1883 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1884 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1885 Heps = _mm256_mul_ps(vfeps,H);
1886 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1887 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1888 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
1892 /* Calculate temporary vectorial force */
1893 tx = _mm256_mul_ps(fscal,dx10);
1894 ty = _mm256_mul_ps(fscal,dy10);
1895 tz = _mm256_mul_ps(fscal,dz10);
1897 /* Update vectorial force */
1898 fix1 = _mm256_add_ps(fix1,tx);
1899 fiy1 = _mm256_add_ps(fiy1,ty);
1900 fiz1 = _mm256_add_ps(fiz1,tz);
1902 fjx0 = _mm256_add_ps(fjx0,tx);
1903 fjy0 = _mm256_add_ps(fjy0,ty);
1904 fjz0 = _mm256_add_ps(fjz0,tz);
1906 /**************************
1907 * CALCULATE INTERACTIONS *
1908 **************************/
1910 r11 = _mm256_mul_ps(rsq11,rinv11);
1912 /* Calculate table index by multiplying r with table scale and truncate to integer */
1913 rt = _mm256_mul_ps(r11,vftabscale);
1914 vfitab = _mm256_cvttps_epi32(rt);
1915 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1916 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1917 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1918 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1919 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1920 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1922 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1923 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1924 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1925 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1926 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1927 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1928 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1929 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1930 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1931 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1932 Heps = _mm256_mul_ps(vfeps,H);
1933 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1934 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1935 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
1939 /* Calculate temporary vectorial force */
1940 tx = _mm256_mul_ps(fscal,dx11);
1941 ty = _mm256_mul_ps(fscal,dy11);
1942 tz = _mm256_mul_ps(fscal,dz11);
1944 /* Update vectorial force */
1945 fix1 = _mm256_add_ps(fix1,tx);
1946 fiy1 = _mm256_add_ps(fiy1,ty);
1947 fiz1 = _mm256_add_ps(fiz1,tz);
1949 fjx1 = _mm256_add_ps(fjx1,tx);
1950 fjy1 = _mm256_add_ps(fjy1,ty);
1951 fjz1 = _mm256_add_ps(fjz1,tz);
1953 /**************************
1954 * CALCULATE INTERACTIONS *
1955 **************************/
1957 r12 = _mm256_mul_ps(rsq12,rinv12);
1959 /* Calculate table index by multiplying r with table scale and truncate to integer */
1960 rt = _mm256_mul_ps(r12,vftabscale);
1961 vfitab = _mm256_cvttps_epi32(rt);
1962 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1963 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1964 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1965 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1966 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
1967 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
1969 /* CUBIC SPLINE TABLE ELECTROSTATICS */
1970 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1971 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1972 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1973 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1974 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1975 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1976 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1977 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1978 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1979 Heps = _mm256_mul_ps(vfeps,H);
1980 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1981 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1982 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
1986 /* Calculate temporary vectorial force */
1987 tx = _mm256_mul_ps(fscal,dx12);
1988 ty = _mm256_mul_ps(fscal,dy12);
1989 tz = _mm256_mul_ps(fscal,dz12);
1991 /* Update vectorial force */
1992 fix1 = _mm256_add_ps(fix1,tx);
1993 fiy1 = _mm256_add_ps(fiy1,ty);
1994 fiz1 = _mm256_add_ps(fiz1,tz);
1996 fjx2 = _mm256_add_ps(fjx2,tx);
1997 fjy2 = _mm256_add_ps(fjy2,ty);
1998 fjz2 = _mm256_add_ps(fjz2,tz);
2000 /**************************
2001 * CALCULATE INTERACTIONS *
2002 **************************/
2004 r20 = _mm256_mul_ps(rsq20,rinv20);
2006 /* Calculate table index by multiplying r with table scale and truncate to integer */
2007 rt = _mm256_mul_ps(r20,vftabscale);
2008 vfitab = _mm256_cvttps_epi32(rt);
2009 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2010 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2011 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2012 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2013 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2014 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2016 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2017 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2018 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2019 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2020 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2021 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2022 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2023 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2024 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2025 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2026 Heps = _mm256_mul_ps(vfeps,H);
2027 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2028 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2029 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2033 /* Calculate temporary vectorial force */
2034 tx = _mm256_mul_ps(fscal,dx20);
2035 ty = _mm256_mul_ps(fscal,dy20);
2036 tz = _mm256_mul_ps(fscal,dz20);
2038 /* Update vectorial force */
2039 fix2 = _mm256_add_ps(fix2,tx);
2040 fiy2 = _mm256_add_ps(fiy2,ty);
2041 fiz2 = _mm256_add_ps(fiz2,tz);
2043 fjx0 = _mm256_add_ps(fjx0,tx);
2044 fjy0 = _mm256_add_ps(fjy0,ty);
2045 fjz0 = _mm256_add_ps(fjz0,tz);
2047 /**************************
2048 * CALCULATE INTERACTIONS *
2049 **************************/
2051 r21 = _mm256_mul_ps(rsq21,rinv21);
2053 /* Calculate table index by multiplying r with table scale and truncate to integer */
2054 rt = _mm256_mul_ps(r21,vftabscale);
2055 vfitab = _mm256_cvttps_epi32(rt);
2056 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2057 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2058 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2059 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2060 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2061 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2063 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2064 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2065 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2066 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2067 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2068 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2069 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2070 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2071 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2072 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2073 Heps = _mm256_mul_ps(vfeps,H);
2074 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2075 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2076 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2080 /* Calculate temporary vectorial force */
2081 tx = _mm256_mul_ps(fscal,dx21);
2082 ty = _mm256_mul_ps(fscal,dy21);
2083 tz = _mm256_mul_ps(fscal,dz21);
2085 /* Update vectorial force */
2086 fix2 = _mm256_add_ps(fix2,tx);
2087 fiy2 = _mm256_add_ps(fiy2,ty);
2088 fiz2 = _mm256_add_ps(fiz2,tz);
2090 fjx1 = _mm256_add_ps(fjx1,tx);
2091 fjy1 = _mm256_add_ps(fjy1,ty);
2092 fjz1 = _mm256_add_ps(fjz1,tz);
2094 /**************************
2095 * CALCULATE INTERACTIONS *
2096 **************************/
2098 r22 = _mm256_mul_ps(rsq22,rinv22);
2100 /* Calculate table index by multiplying r with table scale and truncate to integer */
2101 rt = _mm256_mul_ps(r22,vftabscale);
2102 vfitab = _mm256_cvttps_epi32(rt);
2103 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2104 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2105 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2106 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2107 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2108 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2110 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2111 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2112 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2113 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2114 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2115 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2116 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2117 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2118 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2119 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2120 Heps = _mm256_mul_ps(vfeps,H);
2121 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2122 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2123 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2127 /* Calculate temporary vectorial force */
2128 tx = _mm256_mul_ps(fscal,dx22);
2129 ty = _mm256_mul_ps(fscal,dy22);
2130 tz = _mm256_mul_ps(fscal,dz22);
2132 /* Update vectorial force */
2133 fix2 = _mm256_add_ps(fix2,tx);
2134 fiy2 = _mm256_add_ps(fiy2,ty);
2135 fiz2 = _mm256_add_ps(fiz2,tz);
2137 fjx2 = _mm256_add_ps(fjx2,tx);
2138 fjy2 = _mm256_add_ps(fjy2,ty);
2139 fjz2 = _mm256_add_ps(fjz2,tz);
2141 fjptrA = f+j_coord_offsetA;
2142 fjptrB = f+j_coord_offsetB;
2143 fjptrC = f+j_coord_offsetC;
2144 fjptrD = f+j_coord_offsetD;
2145 fjptrE = f+j_coord_offsetE;
2146 fjptrF = f+j_coord_offsetF;
2147 fjptrG = f+j_coord_offsetG;
2148 fjptrH = f+j_coord_offsetH;
2150 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2151 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2153 /* Inner loop uses 359 flops */
2156 if(jidx<j_index_end)
2159 /* Get j neighbor index, and coordinate index */
2160 jnrlistA = jjnr[jidx];
2161 jnrlistB = jjnr[jidx+1];
2162 jnrlistC = jjnr[jidx+2];
2163 jnrlistD = jjnr[jidx+3];
2164 jnrlistE = jjnr[jidx+4];
2165 jnrlistF = jjnr[jidx+5];
2166 jnrlistG = jjnr[jidx+6];
2167 jnrlistH = jjnr[jidx+7];
2168 /* Sign of each element will be negative for non-real atoms.
2169 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
2170 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
2172 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
2173 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
2175 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
2176 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
2177 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
2178 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
2179 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
2180 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
2181 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
2182 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
2183 j_coord_offsetA = DIM*jnrA;
2184 j_coord_offsetB = DIM*jnrB;
2185 j_coord_offsetC = DIM*jnrC;
2186 j_coord_offsetD = DIM*jnrD;
2187 j_coord_offsetE = DIM*jnrE;
2188 j_coord_offsetF = DIM*jnrF;
2189 j_coord_offsetG = DIM*jnrG;
2190 j_coord_offsetH = DIM*jnrH;
2192 /* load j atom coordinates */
2193 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
2194 x+j_coord_offsetC,x+j_coord_offsetD,
2195 x+j_coord_offsetE,x+j_coord_offsetF,
2196 x+j_coord_offsetG,x+j_coord_offsetH,
2197 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
2199 /* Calculate displacement vector */
2200 dx00 = _mm256_sub_ps(ix0,jx0);
2201 dy00 = _mm256_sub_ps(iy0,jy0);
2202 dz00 = _mm256_sub_ps(iz0,jz0);
2203 dx01 = _mm256_sub_ps(ix0,jx1);
2204 dy01 = _mm256_sub_ps(iy0,jy1);
2205 dz01 = _mm256_sub_ps(iz0,jz1);
2206 dx02 = _mm256_sub_ps(ix0,jx2);
2207 dy02 = _mm256_sub_ps(iy0,jy2);
2208 dz02 = _mm256_sub_ps(iz0,jz2);
2209 dx10 = _mm256_sub_ps(ix1,jx0);
2210 dy10 = _mm256_sub_ps(iy1,jy0);
2211 dz10 = _mm256_sub_ps(iz1,jz0);
2212 dx11 = _mm256_sub_ps(ix1,jx1);
2213 dy11 = _mm256_sub_ps(iy1,jy1);
2214 dz11 = _mm256_sub_ps(iz1,jz1);
2215 dx12 = _mm256_sub_ps(ix1,jx2);
2216 dy12 = _mm256_sub_ps(iy1,jy2);
2217 dz12 = _mm256_sub_ps(iz1,jz2);
2218 dx20 = _mm256_sub_ps(ix2,jx0);
2219 dy20 = _mm256_sub_ps(iy2,jy0);
2220 dz20 = _mm256_sub_ps(iz2,jz0);
2221 dx21 = _mm256_sub_ps(ix2,jx1);
2222 dy21 = _mm256_sub_ps(iy2,jy1);
2223 dz21 = _mm256_sub_ps(iz2,jz1);
2224 dx22 = _mm256_sub_ps(ix2,jx2);
2225 dy22 = _mm256_sub_ps(iy2,jy2);
2226 dz22 = _mm256_sub_ps(iz2,jz2);
2228 /* Calculate squared distance and things based on it */
2229 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
2230 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
2231 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
2232 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
2233 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
2234 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
2235 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
2236 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
2237 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
2239 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
2240 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
2241 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
2242 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
2243 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
2244 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
2245 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
2246 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
2247 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
2249 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
2251 fjx0 = _mm256_setzero_ps();
2252 fjy0 = _mm256_setzero_ps();
2253 fjz0 = _mm256_setzero_ps();
2254 fjx1 = _mm256_setzero_ps();
2255 fjy1 = _mm256_setzero_ps();
2256 fjz1 = _mm256_setzero_ps();
2257 fjx2 = _mm256_setzero_ps();
2258 fjy2 = _mm256_setzero_ps();
2259 fjz2 = _mm256_setzero_ps();
2261 /**************************
2262 * CALCULATE INTERACTIONS *
2263 **************************/
2265 r00 = _mm256_mul_ps(rsq00,rinv00);
2266 r00 = _mm256_andnot_ps(dummy_mask,r00);
2268 /* Calculate table index by multiplying r with table scale and truncate to integer */
2269 rt = _mm256_mul_ps(r00,vftabscale);
2270 vfitab = _mm256_cvttps_epi32(rt);
2271 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2272 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2273 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2274 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2275 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2276 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2278 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2279 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2280 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2281 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2282 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2283 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2284 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2285 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2286 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2287 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2288 Heps = _mm256_mul_ps(vfeps,H);
2289 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2290 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2291 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq00,FF),_mm256_mul_ps(vftabscale,rinv00)));
2293 /* LENNARD-JONES DISPERSION/REPULSION */
2295 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
2296 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
2298 fscal = _mm256_add_ps(felec,fvdw);
2300 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2302 /* Calculate temporary vectorial force */
2303 tx = _mm256_mul_ps(fscal,dx00);
2304 ty = _mm256_mul_ps(fscal,dy00);
2305 tz = _mm256_mul_ps(fscal,dz00);
2307 /* Update vectorial force */
2308 fix0 = _mm256_add_ps(fix0,tx);
2309 fiy0 = _mm256_add_ps(fiy0,ty);
2310 fiz0 = _mm256_add_ps(fiz0,tz);
2312 fjx0 = _mm256_add_ps(fjx0,tx);
2313 fjy0 = _mm256_add_ps(fjy0,ty);
2314 fjz0 = _mm256_add_ps(fjz0,tz);
2316 /**************************
2317 * CALCULATE INTERACTIONS *
2318 **************************/
2320 r01 = _mm256_mul_ps(rsq01,rinv01);
2321 r01 = _mm256_andnot_ps(dummy_mask,r01);
2323 /* Calculate table index by multiplying r with table scale and truncate to integer */
2324 rt = _mm256_mul_ps(r01,vftabscale);
2325 vfitab = _mm256_cvttps_epi32(rt);
2326 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2327 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2328 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2329 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2330 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2331 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2333 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2334 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2335 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2336 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2337 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2338 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2339 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2340 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2341 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2342 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2343 Heps = _mm256_mul_ps(vfeps,H);
2344 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2345 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2346 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq01,FF),_mm256_mul_ps(vftabscale,rinv01)));
2350 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2352 /* Calculate temporary vectorial force */
2353 tx = _mm256_mul_ps(fscal,dx01);
2354 ty = _mm256_mul_ps(fscal,dy01);
2355 tz = _mm256_mul_ps(fscal,dz01);
2357 /* Update vectorial force */
2358 fix0 = _mm256_add_ps(fix0,tx);
2359 fiy0 = _mm256_add_ps(fiy0,ty);
2360 fiz0 = _mm256_add_ps(fiz0,tz);
2362 fjx1 = _mm256_add_ps(fjx1,tx);
2363 fjy1 = _mm256_add_ps(fjy1,ty);
2364 fjz1 = _mm256_add_ps(fjz1,tz);
2366 /**************************
2367 * CALCULATE INTERACTIONS *
2368 **************************/
2370 r02 = _mm256_mul_ps(rsq02,rinv02);
2371 r02 = _mm256_andnot_ps(dummy_mask,r02);
2373 /* Calculate table index by multiplying r with table scale and truncate to integer */
2374 rt = _mm256_mul_ps(r02,vftabscale);
2375 vfitab = _mm256_cvttps_epi32(rt);
2376 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2377 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2378 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2379 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2380 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2381 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2383 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2384 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2385 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2386 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2387 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2388 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2389 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2390 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2391 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2392 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2393 Heps = _mm256_mul_ps(vfeps,H);
2394 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2395 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2396 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq02,FF),_mm256_mul_ps(vftabscale,rinv02)));
2400 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2402 /* Calculate temporary vectorial force */
2403 tx = _mm256_mul_ps(fscal,dx02);
2404 ty = _mm256_mul_ps(fscal,dy02);
2405 tz = _mm256_mul_ps(fscal,dz02);
2407 /* Update vectorial force */
2408 fix0 = _mm256_add_ps(fix0,tx);
2409 fiy0 = _mm256_add_ps(fiy0,ty);
2410 fiz0 = _mm256_add_ps(fiz0,tz);
2412 fjx2 = _mm256_add_ps(fjx2,tx);
2413 fjy2 = _mm256_add_ps(fjy2,ty);
2414 fjz2 = _mm256_add_ps(fjz2,tz);
2416 /**************************
2417 * CALCULATE INTERACTIONS *
2418 **************************/
2420 r10 = _mm256_mul_ps(rsq10,rinv10);
2421 r10 = _mm256_andnot_ps(dummy_mask,r10);
2423 /* Calculate table index by multiplying r with table scale and truncate to integer */
2424 rt = _mm256_mul_ps(r10,vftabscale);
2425 vfitab = _mm256_cvttps_epi32(rt);
2426 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2427 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2428 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2429 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2430 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2431 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2433 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2434 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2435 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2436 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2437 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2438 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2439 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2440 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2441 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2442 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2443 Heps = _mm256_mul_ps(vfeps,H);
2444 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2445 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2446 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq10,FF),_mm256_mul_ps(vftabscale,rinv10)));
2450 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2452 /* Calculate temporary vectorial force */
2453 tx = _mm256_mul_ps(fscal,dx10);
2454 ty = _mm256_mul_ps(fscal,dy10);
2455 tz = _mm256_mul_ps(fscal,dz10);
2457 /* Update vectorial force */
2458 fix1 = _mm256_add_ps(fix1,tx);
2459 fiy1 = _mm256_add_ps(fiy1,ty);
2460 fiz1 = _mm256_add_ps(fiz1,tz);
2462 fjx0 = _mm256_add_ps(fjx0,tx);
2463 fjy0 = _mm256_add_ps(fjy0,ty);
2464 fjz0 = _mm256_add_ps(fjz0,tz);
2466 /**************************
2467 * CALCULATE INTERACTIONS *
2468 **************************/
2470 r11 = _mm256_mul_ps(rsq11,rinv11);
2471 r11 = _mm256_andnot_ps(dummy_mask,r11);
2473 /* Calculate table index by multiplying r with table scale and truncate to integer */
2474 rt = _mm256_mul_ps(r11,vftabscale);
2475 vfitab = _mm256_cvttps_epi32(rt);
2476 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2477 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2478 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2479 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2480 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2481 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2483 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2484 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2485 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2486 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2487 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2488 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2489 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2490 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2491 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2492 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2493 Heps = _mm256_mul_ps(vfeps,H);
2494 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2495 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2496 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq11,FF),_mm256_mul_ps(vftabscale,rinv11)));
2500 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2502 /* Calculate temporary vectorial force */
2503 tx = _mm256_mul_ps(fscal,dx11);
2504 ty = _mm256_mul_ps(fscal,dy11);
2505 tz = _mm256_mul_ps(fscal,dz11);
2507 /* Update vectorial force */
2508 fix1 = _mm256_add_ps(fix1,tx);
2509 fiy1 = _mm256_add_ps(fiy1,ty);
2510 fiz1 = _mm256_add_ps(fiz1,tz);
2512 fjx1 = _mm256_add_ps(fjx1,tx);
2513 fjy1 = _mm256_add_ps(fjy1,ty);
2514 fjz1 = _mm256_add_ps(fjz1,tz);
2516 /**************************
2517 * CALCULATE INTERACTIONS *
2518 **************************/
2520 r12 = _mm256_mul_ps(rsq12,rinv12);
2521 r12 = _mm256_andnot_ps(dummy_mask,r12);
2523 /* Calculate table index by multiplying r with table scale and truncate to integer */
2524 rt = _mm256_mul_ps(r12,vftabscale);
2525 vfitab = _mm256_cvttps_epi32(rt);
2526 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2527 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2528 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2529 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2530 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2531 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2533 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2534 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2535 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2536 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2537 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2538 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2539 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2540 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2541 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2542 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2543 Heps = _mm256_mul_ps(vfeps,H);
2544 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2545 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2546 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq12,FF),_mm256_mul_ps(vftabscale,rinv12)));
2550 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2552 /* Calculate temporary vectorial force */
2553 tx = _mm256_mul_ps(fscal,dx12);
2554 ty = _mm256_mul_ps(fscal,dy12);
2555 tz = _mm256_mul_ps(fscal,dz12);
2557 /* Update vectorial force */
2558 fix1 = _mm256_add_ps(fix1,tx);
2559 fiy1 = _mm256_add_ps(fiy1,ty);
2560 fiz1 = _mm256_add_ps(fiz1,tz);
2562 fjx2 = _mm256_add_ps(fjx2,tx);
2563 fjy2 = _mm256_add_ps(fjy2,ty);
2564 fjz2 = _mm256_add_ps(fjz2,tz);
2566 /**************************
2567 * CALCULATE INTERACTIONS *
2568 **************************/
2570 r20 = _mm256_mul_ps(rsq20,rinv20);
2571 r20 = _mm256_andnot_ps(dummy_mask,r20);
2573 /* Calculate table index by multiplying r with table scale and truncate to integer */
2574 rt = _mm256_mul_ps(r20,vftabscale);
2575 vfitab = _mm256_cvttps_epi32(rt);
2576 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2577 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2578 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2579 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2580 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2581 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2583 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2584 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2585 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2586 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2587 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2588 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2589 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2590 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2591 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2592 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2593 Heps = _mm256_mul_ps(vfeps,H);
2594 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2595 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2596 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq20,FF),_mm256_mul_ps(vftabscale,rinv20)));
2600 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2602 /* Calculate temporary vectorial force */
2603 tx = _mm256_mul_ps(fscal,dx20);
2604 ty = _mm256_mul_ps(fscal,dy20);
2605 tz = _mm256_mul_ps(fscal,dz20);
2607 /* Update vectorial force */
2608 fix2 = _mm256_add_ps(fix2,tx);
2609 fiy2 = _mm256_add_ps(fiy2,ty);
2610 fiz2 = _mm256_add_ps(fiz2,tz);
2612 fjx0 = _mm256_add_ps(fjx0,tx);
2613 fjy0 = _mm256_add_ps(fjy0,ty);
2614 fjz0 = _mm256_add_ps(fjz0,tz);
2616 /**************************
2617 * CALCULATE INTERACTIONS *
2618 **************************/
2620 r21 = _mm256_mul_ps(rsq21,rinv21);
2621 r21 = _mm256_andnot_ps(dummy_mask,r21);
2623 /* Calculate table index by multiplying r with table scale and truncate to integer */
2624 rt = _mm256_mul_ps(r21,vftabscale);
2625 vfitab = _mm256_cvttps_epi32(rt);
2626 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2627 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2628 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2629 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2630 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2631 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2633 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2634 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2635 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2636 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2637 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2638 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2639 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2640 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2641 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2642 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2643 Heps = _mm256_mul_ps(vfeps,H);
2644 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2645 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2646 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq21,FF),_mm256_mul_ps(vftabscale,rinv21)));
2650 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2652 /* Calculate temporary vectorial force */
2653 tx = _mm256_mul_ps(fscal,dx21);
2654 ty = _mm256_mul_ps(fscal,dy21);
2655 tz = _mm256_mul_ps(fscal,dz21);
2657 /* Update vectorial force */
2658 fix2 = _mm256_add_ps(fix2,tx);
2659 fiy2 = _mm256_add_ps(fiy2,ty);
2660 fiz2 = _mm256_add_ps(fiz2,tz);
2662 fjx1 = _mm256_add_ps(fjx1,tx);
2663 fjy1 = _mm256_add_ps(fjy1,ty);
2664 fjz1 = _mm256_add_ps(fjz1,tz);
2666 /**************************
2667 * CALCULATE INTERACTIONS *
2668 **************************/
2670 r22 = _mm256_mul_ps(rsq22,rinv22);
2671 r22 = _mm256_andnot_ps(dummy_mask,r22);
2673 /* Calculate table index by multiplying r with table scale and truncate to integer */
2674 rt = _mm256_mul_ps(r22,vftabscale);
2675 vfitab = _mm256_cvttps_epi32(rt);
2676 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2677 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2678 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2679 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2680 vfitab_lo = _mm_slli_epi32(vfitab_lo,2);
2681 vfitab_hi = _mm_slli_epi32(vfitab_hi,2);
2683 /* CUBIC SPLINE TABLE ELECTROSTATICS */
2684 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2685 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2686 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2687 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2688 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2689 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2690 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2691 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2692 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2693 Heps = _mm256_mul_ps(vfeps,H);
2694 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2695 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2696 felec = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_mul_ps(qq22,FF),_mm256_mul_ps(vftabscale,rinv22)));
2700 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2702 /* Calculate temporary vectorial force */
2703 tx = _mm256_mul_ps(fscal,dx22);
2704 ty = _mm256_mul_ps(fscal,dy22);
2705 tz = _mm256_mul_ps(fscal,dz22);
2707 /* Update vectorial force */
2708 fix2 = _mm256_add_ps(fix2,tx);
2709 fiy2 = _mm256_add_ps(fiy2,ty);
2710 fiz2 = _mm256_add_ps(fiz2,tz);
2712 fjx2 = _mm256_add_ps(fjx2,tx);
2713 fjy2 = _mm256_add_ps(fjy2,ty);
2714 fjz2 = _mm256_add_ps(fjz2,tz);
2716 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2717 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2718 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2719 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2720 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2721 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2722 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2723 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2725 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2726 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2728 /* Inner loop uses 368 flops */
2731 /* End of innermost loop */
2733 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2734 f+i_coord_offset,fshift+i_shift_offset);
2736 /* Increment number of inner iterations */
2737 inneriter += j_index_end - j_index_start;
2739 /* Outer loop uses 18 flops */
2742 /* Increment number of outer iterations */
2745 /* Update outer/inner flops */
2747 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*368);