2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
81 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
83 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
85 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
86 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
87 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
88 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
89 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
90 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
91 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
92 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
93 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
96 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
100 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
102 __m128i vfitab_lo,vfitab_hi;
103 __m128i ifour = _mm_set1_epi32(4);
104 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
106 __m256 dummy_mask,cutoff_mask;
107 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
108 __m256 one = _mm256_set1_ps(1.0);
109 __m256 two = _mm256_set1_ps(2.0);
115 jindex = nlist->jindex;
117 shiftidx = nlist->shift;
119 shiftvec = fr->shift_vec[0];
120 fshift = fr->fshift[0];
121 facel = _mm256_set1_ps(fr->epsfac);
122 charge = mdatoms->chargeA;
123 krf = _mm256_set1_ps(fr->ic->k_rf);
124 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
125 crf = _mm256_set1_ps(fr->ic->c_rf);
126 nvdwtype = fr->ntype;
128 vdwtype = mdatoms->typeA;
130 vftab = kernel_data->table_vdw->data;
131 vftabscale = _mm256_set1_ps(kernel_data->table_vdw->scale);
133 /* Setup water-specific parameters */
134 inr = nlist->iinr[0];
135 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
136 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
137 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
138 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
140 jq0 = _mm256_set1_ps(charge[inr+0]);
141 jq1 = _mm256_set1_ps(charge[inr+1]);
142 jq2 = _mm256_set1_ps(charge[inr+2]);
143 vdwjidx0A = 2*vdwtype[inr+0];
144 qq00 = _mm256_mul_ps(iq0,jq0);
145 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
146 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
147 qq01 = _mm256_mul_ps(iq0,jq1);
148 qq02 = _mm256_mul_ps(iq0,jq2);
149 qq10 = _mm256_mul_ps(iq1,jq0);
150 qq11 = _mm256_mul_ps(iq1,jq1);
151 qq12 = _mm256_mul_ps(iq1,jq2);
152 qq20 = _mm256_mul_ps(iq2,jq0);
153 qq21 = _mm256_mul_ps(iq2,jq1);
154 qq22 = _mm256_mul_ps(iq2,jq2);
156 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
157 rcutoff_scalar = fr->rcoulomb;
158 rcutoff = _mm256_set1_ps(rcutoff_scalar);
159 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
161 /* Avoid stupid compiler warnings */
162 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
175 for(iidx=0;iidx<4*DIM;iidx++)
180 /* Start outer loop over neighborlists */
181 for(iidx=0; iidx<nri; iidx++)
183 /* Load shift vector for this list */
184 i_shift_offset = DIM*shiftidx[iidx];
186 /* Load limits for loop over neighbors */
187 j_index_start = jindex[iidx];
188 j_index_end = jindex[iidx+1];
190 /* Get outer coordinate index */
192 i_coord_offset = DIM*inr;
194 /* Load i particle coords and add shift vector */
195 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
196 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
198 fix0 = _mm256_setzero_ps();
199 fiy0 = _mm256_setzero_ps();
200 fiz0 = _mm256_setzero_ps();
201 fix1 = _mm256_setzero_ps();
202 fiy1 = _mm256_setzero_ps();
203 fiz1 = _mm256_setzero_ps();
204 fix2 = _mm256_setzero_ps();
205 fiy2 = _mm256_setzero_ps();
206 fiz2 = _mm256_setzero_ps();
208 /* Reset potential sums */
209 velecsum = _mm256_setzero_ps();
210 vvdwsum = _mm256_setzero_ps();
212 /* Start inner kernel loop */
213 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
216 /* Get j neighbor index, and coordinate index */
225 j_coord_offsetA = DIM*jnrA;
226 j_coord_offsetB = DIM*jnrB;
227 j_coord_offsetC = DIM*jnrC;
228 j_coord_offsetD = DIM*jnrD;
229 j_coord_offsetE = DIM*jnrE;
230 j_coord_offsetF = DIM*jnrF;
231 j_coord_offsetG = DIM*jnrG;
232 j_coord_offsetH = DIM*jnrH;
234 /* load j atom coordinates */
235 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
236 x+j_coord_offsetC,x+j_coord_offsetD,
237 x+j_coord_offsetE,x+j_coord_offsetF,
238 x+j_coord_offsetG,x+j_coord_offsetH,
239 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
241 /* Calculate displacement vector */
242 dx00 = _mm256_sub_ps(ix0,jx0);
243 dy00 = _mm256_sub_ps(iy0,jy0);
244 dz00 = _mm256_sub_ps(iz0,jz0);
245 dx01 = _mm256_sub_ps(ix0,jx1);
246 dy01 = _mm256_sub_ps(iy0,jy1);
247 dz01 = _mm256_sub_ps(iz0,jz1);
248 dx02 = _mm256_sub_ps(ix0,jx2);
249 dy02 = _mm256_sub_ps(iy0,jy2);
250 dz02 = _mm256_sub_ps(iz0,jz2);
251 dx10 = _mm256_sub_ps(ix1,jx0);
252 dy10 = _mm256_sub_ps(iy1,jy0);
253 dz10 = _mm256_sub_ps(iz1,jz0);
254 dx11 = _mm256_sub_ps(ix1,jx1);
255 dy11 = _mm256_sub_ps(iy1,jy1);
256 dz11 = _mm256_sub_ps(iz1,jz1);
257 dx12 = _mm256_sub_ps(ix1,jx2);
258 dy12 = _mm256_sub_ps(iy1,jy2);
259 dz12 = _mm256_sub_ps(iz1,jz2);
260 dx20 = _mm256_sub_ps(ix2,jx0);
261 dy20 = _mm256_sub_ps(iy2,jy0);
262 dz20 = _mm256_sub_ps(iz2,jz0);
263 dx21 = _mm256_sub_ps(ix2,jx1);
264 dy21 = _mm256_sub_ps(iy2,jy1);
265 dz21 = _mm256_sub_ps(iz2,jz1);
266 dx22 = _mm256_sub_ps(ix2,jx2);
267 dy22 = _mm256_sub_ps(iy2,jy2);
268 dz22 = _mm256_sub_ps(iz2,jz2);
270 /* Calculate squared distance and things based on it */
271 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
272 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
273 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
274 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
275 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
276 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
277 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
278 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
279 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
281 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
282 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
283 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
284 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
285 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
286 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
287 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
288 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
289 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
291 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
292 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
293 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
294 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
295 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
296 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
297 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
298 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
299 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
301 fjx0 = _mm256_setzero_ps();
302 fjy0 = _mm256_setzero_ps();
303 fjz0 = _mm256_setzero_ps();
304 fjx1 = _mm256_setzero_ps();
305 fjy1 = _mm256_setzero_ps();
306 fjz1 = _mm256_setzero_ps();
307 fjx2 = _mm256_setzero_ps();
308 fjy2 = _mm256_setzero_ps();
309 fjz2 = _mm256_setzero_ps();
311 /**************************
312 * CALCULATE INTERACTIONS *
313 **************************/
315 if (gmx_mm256_any_lt(rsq00,rcutoff2))
318 r00 = _mm256_mul_ps(rsq00,rinv00);
320 /* Calculate table index by multiplying r with table scale and truncate to integer */
321 rt = _mm256_mul_ps(r00,vftabscale);
322 vfitab = _mm256_cvttps_epi32(rt);
323 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
324 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
325 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
326 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
327 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
328 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
330 /* REACTION-FIELD ELECTROSTATICS */
331 velec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
332 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
334 /* CUBIC SPLINE TABLE DISPERSION */
335 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
336 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
337 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
338 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
339 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
340 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
341 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
342 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
343 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
344 Heps = _mm256_mul_ps(vfeps,H);
345 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
346 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
347 vvdw6 = _mm256_mul_ps(c6_00,VV);
348 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
349 fvdw6 = _mm256_mul_ps(c6_00,FF);
351 /* CUBIC SPLINE TABLE REPULSION */
352 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
353 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
354 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
355 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
356 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
357 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
358 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
359 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
360 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
361 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
362 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
363 Heps = _mm256_mul_ps(vfeps,H);
364 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
365 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
366 vvdw12 = _mm256_mul_ps(c12_00,VV);
367 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
368 fvdw12 = _mm256_mul_ps(c12_00,FF);
369 vvdw = _mm256_add_ps(vvdw12,vvdw6);
370 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
372 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
374 /* Update potential sum for this i atom from the interaction with this j atom. */
375 velec = _mm256_and_ps(velec,cutoff_mask);
376 velecsum = _mm256_add_ps(velecsum,velec);
377 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
378 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
380 fscal = _mm256_add_ps(felec,fvdw);
382 fscal = _mm256_and_ps(fscal,cutoff_mask);
384 /* Calculate temporary vectorial force */
385 tx = _mm256_mul_ps(fscal,dx00);
386 ty = _mm256_mul_ps(fscal,dy00);
387 tz = _mm256_mul_ps(fscal,dz00);
389 /* Update vectorial force */
390 fix0 = _mm256_add_ps(fix0,tx);
391 fiy0 = _mm256_add_ps(fiy0,ty);
392 fiz0 = _mm256_add_ps(fiz0,tz);
394 fjx0 = _mm256_add_ps(fjx0,tx);
395 fjy0 = _mm256_add_ps(fjy0,ty);
396 fjz0 = _mm256_add_ps(fjz0,tz);
400 /**************************
401 * CALCULATE INTERACTIONS *
402 **************************/
404 if (gmx_mm256_any_lt(rsq01,rcutoff2))
407 /* REACTION-FIELD ELECTROSTATICS */
408 velec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
409 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
411 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
413 /* Update potential sum for this i atom from the interaction with this j atom. */
414 velec = _mm256_and_ps(velec,cutoff_mask);
415 velecsum = _mm256_add_ps(velecsum,velec);
419 fscal = _mm256_and_ps(fscal,cutoff_mask);
421 /* Calculate temporary vectorial force */
422 tx = _mm256_mul_ps(fscal,dx01);
423 ty = _mm256_mul_ps(fscal,dy01);
424 tz = _mm256_mul_ps(fscal,dz01);
426 /* Update vectorial force */
427 fix0 = _mm256_add_ps(fix0,tx);
428 fiy0 = _mm256_add_ps(fiy0,ty);
429 fiz0 = _mm256_add_ps(fiz0,tz);
431 fjx1 = _mm256_add_ps(fjx1,tx);
432 fjy1 = _mm256_add_ps(fjy1,ty);
433 fjz1 = _mm256_add_ps(fjz1,tz);
437 /**************************
438 * CALCULATE INTERACTIONS *
439 **************************/
441 if (gmx_mm256_any_lt(rsq02,rcutoff2))
444 /* REACTION-FIELD ELECTROSTATICS */
445 velec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
446 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
448 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
450 /* Update potential sum for this i atom from the interaction with this j atom. */
451 velec = _mm256_and_ps(velec,cutoff_mask);
452 velecsum = _mm256_add_ps(velecsum,velec);
456 fscal = _mm256_and_ps(fscal,cutoff_mask);
458 /* Calculate temporary vectorial force */
459 tx = _mm256_mul_ps(fscal,dx02);
460 ty = _mm256_mul_ps(fscal,dy02);
461 tz = _mm256_mul_ps(fscal,dz02);
463 /* Update vectorial force */
464 fix0 = _mm256_add_ps(fix0,tx);
465 fiy0 = _mm256_add_ps(fiy0,ty);
466 fiz0 = _mm256_add_ps(fiz0,tz);
468 fjx2 = _mm256_add_ps(fjx2,tx);
469 fjy2 = _mm256_add_ps(fjy2,ty);
470 fjz2 = _mm256_add_ps(fjz2,tz);
474 /**************************
475 * CALCULATE INTERACTIONS *
476 **************************/
478 if (gmx_mm256_any_lt(rsq10,rcutoff2))
481 /* REACTION-FIELD ELECTROSTATICS */
482 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
483 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
485 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
487 /* Update potential sum for this i atom from the interaction with this j atom. */
488 velec = _mm256_and_ps(velec,cutoff_mask);
489 velecsum = _mm256_add_ps(velecsum,velec);
493 fscal = _mm256_and_ps(fscal,cutoff_mask);
495 /* Calculate temporary vectorial force */
496 tx = _mm256_mul_ps(fscal,dx10);
497 ty = _mm256_mul_ps(fscal,dy10);
498 tz = _mm256_mul_ps(fscal,dz10);
500 /* Update vectorial force */
501 fix1 = _mm256_add_ps(fix1,tx);
502 fiy1 = _mm256_add_ps(fiy1,ty);
503 fiz1 = _mm256_add_ps(fiz1,tz);
505 fjx0 = _mm256_add_ps(fjx0,tx);
506 fjy0 = _mm256_add_ps(fjy0,ty);
507 fjz0 = _mm256_add_ps(fjz0,tz);
511 /**************************
512 * CALCULATE INTERACTIONS *
513 **************************/
515 if (gmx_mm256_any_lt(rsq11,rcutoff2))
518 /* REACTION-FIELD ELECTROSTATICS */
519 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
520 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
522 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
524 /* Update potential sum for this i atom from the interaction with this j atom. */
525 velec = _mm256_and_ps(velec,cutoff_mask);
526 velecsum = _mm256_add_ps(velecsum,velec);
530 fscal = _mm256_and_ps(fscal,cutoff_mask);
532 /* Calculate temporary vectorial force */
533 tx = _mm256_mul_ps(fscal,dx11);
534 ty = _mm256_mul_ps(fscal,dy11);
535 tz = _mm256_mul_ps(fscal,dz11);
537 /* Update vectorial force */
538 fix1 = _mm256_add_ps(fix1,tx);
539 fiy1 = _mm256_add_ps(fiy1,ty);
540 fiz1 = _mm256_add_ps(fiz1,tz);
542 fjx1 = _mm256_add_ps(fjx1,tx);
543 fjy1 = _mm256_add_ps(fjy1,ty);
544 fjz1 = _mm256_add_ps(fjz1,tz);
548 /**************************
549 * CALCULATE INTERACTIONS *
550 **************************/
552 if (gmx_mm256_any_lt(rsq12,rcutoff2))
555 /* REACTION-FIELD ELECTROSTATICS */
556 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
557 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
559 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
561 /* Update potential sum for this i atom from the interaction with this j atom. */
562 velec = _mm256_and_ps(velec,cutoff_mask);
563 velecsum = _mm256_add_ps(velecsum,velec);
567 fscal = _mm256_and_ps(fscal,cutoff_mask);
569 /* Calculate temporary vectorial force */
570 tx = _mm256_mul_ps(fscal,dx12);
571 ty = _mm256_mul_ps(fscal,dy12);
572 tz = _mm256_mul_ps(fscal,dz12);
574 /* Update vectorial force */
575 fix1 = _mm256_add_ps(fix1,tx);
576 fiy1 = _mm256_add_ps(fiy1,ty);
577 fiz1 = _mm256_add_ps(fiz1,tz);
579 fjx2 = _mm256_add_ps(fjx2,tx);
580 fjy2 = _mm256_add_ps(fjy2,ty);
581 fjz2 = _mm256_add_ps(fjz2,tz);
585 /**************************
586 * CALCULATE INTERACTIONS *
587 **************************/
589 if (gmx_mm256_any_lt(rsq20,rcutoff2))
592 /* REACTION-FIELD ELECTROSTATICS */
593 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
594 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
596 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
598 /* Update potential sum for this i atom from the interaction with this j atom. */
599 velec = _mm256_and_ps(velec,cutoff_mask);
600 velecsum = _mm256_add_ps(velecsum,velec);
604 fscal = _mm256_and_ps(fscal,cutoff_mask);
606 /* Calculate temporary vectorial force */
607 tx = _mm256_mul_ps(fscal,dx20);
608 ty = _mm256_mul_ps(fscal,dy20);
609 tz = _mm256_mul_ps(fscal,dz20);
611 /* Update vectorial force */
612 fix2 = _mm256_add_ps(fix2,tx);
613 fiy2 = _mm256_add_ps(fiy2,ty);
614 fiz2 = _mm256_add_ps(fiz2,tz);
616 fjx0 = _mm256_add_ps(fjx0,tx);
617 fjy0 = _mm256_add_ps(fjy0,ty);
618 fjz0 = _mm256_add_ps(fjz0,tz);
622 /**************************
623 * CALCULATE INTERACTIONS *
624 **************************/
626 if (gmx_mm256_any_lt(rsq21,rcutoff2))
629 /* REACTION-FIELD ELECTROSTATICS */
630 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
631 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
633 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
635 /* Update potential sum for this i atom from the interaction with this j atom. */
636 velec = _mm256_and_ps(velec,cutoff_mask);
637 velecsum = _mm256_add_ps(velecsum,velec);
641 fscal = _mm256_and_ps(fscal,cutoff_mask);
643 /* Calculate temporary vectorial force */
644 tx = _mm256_mul_ps(fscal,dx21);
645 ty = _mm256_mul_ps(fscal,dy21);
646 tz = _mm256_mul_ps(fscal,dz21);
648 /* Update vectorial force */
649 fix2 = _mm256_add_ps(fix2,tx);
650 fiy2 = _mm256_add_ps(fiy2,ty);
651 fiz2 = _mm256_add_ps(fiz2,tz);
653 fjx1 = _mm256_add_ps(fjx1,tx);
654 fjy1 = _mm256_add_ps(fjy1,ty);
655 fjz1 = _mm256_add_ps(fjz1,tz);
659 /**************************
660 * CALCULATE INTERACTIONS *
661 **************************/
663 if (gmx_mm256_any_lt(rsq22,rcutoff2))
666 /* REACTION-FIELD ELECTROSTATICS */
667 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
668 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
670 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
672 /* Update potential sum for this i atom from the interaction with this j atom. */
673 velec = _mm256_and_ps(velec,cutoff_mask);
674 velecsum = _mm256_add_ps(velecsum,velec);
678 fscal = _mm256_and_ps(fscal,cutoff_mask);
680 /* Calculate temporary vectorial force */
681 tx = _mm256_mul_ps(fscal,dx22);
682 ty = _mm256_mul_ps(fscal,dy22);
683 tz = _mm256_mul_ps(fscal,dz22);
685 /* Update vectorial force */
686 fix2 = _mm256_add_ps(fix2,tx);
687 fiy2 = _mm256_add_ps(fiy2,ty);
688 fiz2 = _mm256_add_ps(fiz2,tz);
690 fjx2 = _mm256_add_ps(fjx2,tx);
691 fjy2 = _mm256_add_ps(fjy2,ty);
692 fjz2 = _mm256_add_ps(fjz2,tz);
696 fjptrA = f+j_coord_offsetA;
697 fjptrB = f+j_coord_offsetB;
698 fjptrC = f+j_coord_offsetC;
699 fjptrD = f+j_coord_offsetD;
700 fjptrE = f+j_coord_offsetE;
701 fjptrF = f+j_coord_offsetF;
702 fjptrG = f+j_coord_offsetG;
703 fjptrH = f+j_coord_offsetH;
705 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
706 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
708 /* Inner loop uses 360 flops */
714 /* Get j neighbor index, and coordinate index */
715 jnrlistA = jjnr[jidx];
716 jnrlistB = jjnr[jidx+1];
717 jnrlistC = jjnr[jidx+2];
718 jnrlistD = jjnr[jidx+3];
719 jnrlistE = jjnr[jidx+4];
720 jnrlistF = jjnr[jidx+5];
721 jnrlistG = jjnr[jidx+6];
722 jnrlistH = jjnr[jidx+7];
723 /* Sign of each element will be negative for non-real atoms.
724 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
725 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
727 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
728 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
730 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
731 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
732 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
733 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
734 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
735 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
736 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
737 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
738 j_coord_offsetA = DIM*jnrA;
739 j_coord_offsetB = DIM*jnrB;
740 j_coord_offsetC = DIM*jnrC;
741 j_coord_offsetD = DIM*jnrD;
742 j_coord_offsetE = DIM*jnrE;
743 j_coord_offsetF = DIM*jnrF;
744 j_coord_offsetG = DIM*jnrG;
745 j_coord_offsetH = DIM*jnrH;
747 /* load j atom coordinates */
748 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
749 x+j_coord_offsetC,x+j_coord_offsetD,
750 x+j_coord_offsetE,x+j_coord_offsetF,
751 x+j_coord_offsetG,x+j_coord_offsetH,
752 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
754 /* Calculate displacement vector */
755 dx00 = _mm256_sub_ps(ix0,jx0);
756 dy00 = _mm256_sub_ps(iy0,jy0);
757 dz00 = _mm256_sub_ps(iz0,jz0);
758 dx01 = _mm256_sub_ps(ix0,jx1);
759 dy01 = _mm256_sub_ps(iy0,jy1);
760 dz01 = _mm256_sub_ps(iz0,jz1);
761 dx02 = _mm256_sub_ps(ix0,jx2);
762 dy02 = _mm256_sub_ps(iy0,jy2);
763 dz02 = _mm256_sub_ps(iz0,jz2);
764 dx10 = _mm256_sub_ps(ix1,jx0);
765 dy10 = _mm256_sub_ps(iy1,jy0);
766 dz10 = _mm256_sub_ps(iz1,jz0);
767 dx11 = _mm256_sub_ps(ix1,jx1);
768 dy11 = _mm256_sub_ps(iy1,jy1);
769 dz11 = _mm256_sub_ps(iz1,jz1);
770 dx12 = _mm256_sub_ps(ix1,jx2);
771 dy12 = _mm256_sub_ps(iy1,jy2);
772 dz12 = _mm256_sub_ps(iz1,jz2);
773 dx20 = _mm256_sub_ps(ix2,jx0);
774 dy20 = _mm256_sub_ps(iy2,jy0);
775 dz20 = _mm256_sub_ps(iz2,jz0);
776 dx21 = _mm256_sub_ps(ix2,jx1);
777 dy21 = _mm256_sub_ps(iy2,jy1);
778 dz21 = _mm256_sub_ps(iz2,jz1);
779 dx22 = _mm256_sub_ps(ix2,jx2);
780 dy22 = _mm256_sub_ps(iy2,jy2);
781 dz22 = _mm256_sub_ps(iz2,jz2);
783 /* Calculate squared distance and things based on it */
784 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
785 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
786 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
787 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
788 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
789 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
790 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
791 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
792 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
794 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
795 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
796 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
797 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
798 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
799 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
800 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
801 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
802 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
804 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
805 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
806 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
807 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
808 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
809 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
810 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
811 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
812 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
814 fjx0 = _mm256_setzero_ps();
815 fjy0 = _mm256_setzero_ps();
816 fjz0 = _mm256_setzero_ps();
817 fjx1 = _mm256_setzero_ps();
818 fjy1 = _mm256_setzero_ps();
819 fjz1 = _mm256_setzero_ps();
820 fjx2 = _mm256_setzero_ps();
821 fjy2 = _mm256_setzero_ps();
822 fjz2 = _mm256_setzero_ps();
824 /**************************
825 * CALCULATE INTERACTIONS *
826 **************************/
828 if (gmx_mm256_any_lt(rsq00,rcutoff2))
831 r00 = _mm256_mul_ps(rsq00,rinv00);
832 r00 = _mm256_andnot_ps(dummy_mask,r00);
834 /* Calculate table index by multiplying r with table scale and truncate to integer */
835 rt = _mm256_mul_ps(r00,vftabscale);
836 vfitab = _mm256_cvttps_epi32(rt);
837 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
838 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
839 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
840 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
841 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
842 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
844 /* REACTION-FIELD ELECTROSTATICS */
845 velec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
846 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
848 /* CUBIC SPLINE TABLE DISPERSION */
849 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
850 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
851 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
852 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
853 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
854 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
855 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
856 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
857 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
858 Heps = _mm256_mul_ps(vfeps,H);
859 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
860 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
861 vvdw6 = _mm256_mul_ps(c6_00,VV);
862 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
863 fvdw6 = _mm256_mul_ps(c6_00,FF);
865 /* CUBIC SPLINE TABLE REPULSION */
866 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
867 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
868 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
869 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
870 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
871 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
872 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
873 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
874 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
875 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
876 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
877 Heps = _mm256_mul_ps(vfeps,H);
878 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
879 VV = _mm256_add_ps(Y,_mm256_mul_ps(vfeps,Fp));
880 vvdw12 = _mm256_mul_ps(c12_00,VV);
881 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
882 fvdw12 = _mm256_mul_ps(c12_00,FF);
883 vvdw = _mm256_add_ps(vvdw12,vvdw6);
884 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
886 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
888 /* Update potential sum for this i atom from the interaction with this j atom. */
889 velec = _mm256_and_ps(velec,cutoff_mask);
890 velec = _mm256_andnot_ps(dummy_mask,velec);
891 velecsum = _mm256_add_ps(velecsum,velec);
892 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
893 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
894 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
896 fscal = _mm256_add_ps(felec,fvdw);
898 fscal = _mm256_and_ps(fscal,cutoff_mask);
900 fscal = _mm256_andnot_ps(dummy_mask,fscal);
902 /* Calculate temporary vectorial force */
903 tx = _mm256_mul_ps(fscal,dx00);
904 ty = _mm256_mul_ps(fscal,dy00);
905 tz = _mm256_mul_ps(fscal,dz00);
907 /* Update vectorial force */
908 fix0 = _mm256_add_ps(fix0,tx);
909 fiy0 = _mm256_add_ps(fiy0,ty);
910 fiz0 = _mm256_add_ps(fiz0,tz);
912 fjx0 = _mm256_add_ps(fjx0,tx);
913 fjy0 = _mm256_add_ps(fjy0,ty);
914 fjz0 = _mm256_add_ps(fjz0,tz);
918 /**************************
919 * CALCULATE INTERACTIONS *
920 **************************/
922 if (gmx_mm256_any_lt(rsq01,rcutoff2))
925 /* REACTION-FIELD ELECTROSTATICS */
926 velec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_add_ps(rinv01,_mm256_mul_ps(krf,rsq01)),crf));
927 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
929 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
931 /* Update potential sum for this i atom from the interaction with this j atom. */
932 velec = _mm256_and_ps(velec,cutoff_mask);
933 velec = _mm256_andnot_ps(dummy_mask,velec);
934 velecsum = _mm256_add_ps(velecsum,velec);
938 fscal = _mm256_and_ps(fscal,cutoff_mask);
940 fscal = _mm256_andnot_ps(dummy_mask,fscal);
942 /* Calculate temporary vectorial force */
943 tx = _mm256_mul_ps(fscal,dx01);
944 ty = _mm256_mul_ps(fscal,dy01);
945 tz = _mm256_mul_ps(fscal,dz01);
947 /* Update vectorial force */
948 fix0 = _mm256_add_ps(fix0,tx);
949 fiy0 = _mm256_add_ps(fiy0,ty);
950 fiz0 = _mm256_add_ps(fiz0,tz);
952 fjx1 = _mm256_add_ps(fjx1,tx);
953 fjy1 = _mm256_add_ps(fjy1,ty);
954 fjz1 = _mm256_add_ps(fjz1,tz);
958 /**************************
959 * CALCULATE INTERACTIONS *
960 **************************/
962 if (gmx_mm256_any_lt(rsq02,rcutoff2))
965 /* REACTION-FIELD ELECTROSTATICS */
966 velec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_add_ps(rinv02,_mm256_mul_ps(krf,rsq02)),crf));
967 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
969 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
971 /* Update potential sum for this i atom from the interaction with this j atom. */
972 velec = _mm256_and_ps(velec,cutoff_mask);
973 velec = _mm256_andnot_ps(dummy_mask,velec);
974 velecsum = _mm256_add_ps(velecsum,velec);
978 fscal = _mm256_and_ps(fscal,cutoff_mask);
980 fscal = _mm256_andnot_ps(dummy_mask,fscal);
982 /* Calculate temporary vectorial force */
983 tx = _mm256_mul_ps(fscal,dx02);
984 ty = _mm256_mul_ps(fscal,dy02);
985 tz = _mm256_mul_ps(fscal,dz02);
987 /* Update vectorial force */
988 fix0 = _mm256_add_ps(fix0,tx);
989 fiy0 = _mm256_add_ps(fiy0,ty);
990 fiz0 = _mm256_add_ps(fiz0,tz);
992 fjx2 = _mm256_add_ps(fjx2,tx);
993 fjy2 = _mm256_add_ps(fjy2,ty);
994 fjz2 = _mm256_add_ps(fjz2,tz);
998 /**************************
999 * CALCULATE INTERACTIONS *
1000 **************************/
1002 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1005 /* REACTION-FIELD ELECTROSTATICS */
1006 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
1007 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1009 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1011 /* Update potential sum for this i atom from the interaction with this j atom. */
1012 velec = _mm256_and_ps(velec,cutoff_mask);
1013 velec = _mm256_andnot_ps(dummy_mask,velec);
1014 velecsum = _mm256_add_ps(velecsum,velec);
1018 fscal = _mm256_and_ps(fscal,cutoff_mask);
1020 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1022 /* Calculate temporary vectorial force */
1023 tx = _mm256_mul_ps(fscal,dx10);
1024 ty = _mm256_mul_ps(fscal,dy10);
1025 tz = _mm256_mul_ps(fscal,dz10);
1027 /* Update vectorial force */
1028 fix1 = _mm256_add_ps(fix1,tx);
1029 fiy1 = _mm256_add_ps(fiy1,ty);
1030 fiz1 = _mm256_add_ps(fiz1,tz);
1032 fjx0 = _mm256_add_ps(fjx0,tx);
1033 fjy0 = _mm256_add_ps(fjy0,ty);
1034 fjz0 = _mm256_add_ps(fjz0,tz);
1038 /**************************
1039 * CALCULATE INTERACTIONS *
1040 **************************/
1042 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1045 /* REACTION-FIELD ELECTROSTATICS */
1046 velec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_add_ps(rinv11,_mm256_mul_ps(krf,rsq11)),crf));
1047 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1049 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1051 /* Update potential sum for this i atom from the interaction with this j atom. */
1052 velec = _mm256_and_ps(velec,cutoff_mask);
1053 velec = _mm256_andnot_ps(dummy_mask,velec);
1054 velecsum = _mm256_add_ps(velecsum,velec);
1058 fscal = _mm256_and_ps(fscal,cutoff_mask);
1060 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1062 /* Calculate temporary vectorial force */
1063 tx = _mm256_mul_ps(fscal,dx11);
1064 ty = _mm256_mul_ps(fscal,dy11);
1065 tz = _mm256_mul_ps(fscal,dz11);
1067 /* Update vectorial force */
1068 fix1 = _mm256_add_ps(fix1,tx);
1069 fiy1 = _mm256_add_ps(fiy1,ty);
1070 fiz1 = _mm256_add_ps(fiz1,tz);
1072 fjx1 = _mm256_add_ps(fjx1,tx);
1073 fjy1 = _mm256_add_ps(fjy1,ty);
1074 fjz1 = _mm256_add_ps(fjz1,tz);
1078 /**************************
1079 * CALCULATE INTERACTIONS *
1080 **************************/
1082 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1085 /* REACTION-FIELD ELECTROSTATICS */
1086 velec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_add_ps(rinv12,_mm256_mul_ps(krf,rsq12)),crf));
1087 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1089 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1091 /* Update potential sum for this i atom from the interaction with this j atom. */
1092 velec = _mm256_and_ps(velec,cutoff_mask);
1093 velec = _mm256_andnot_ps(dummy_mask,velec);
1094 velecsum = _mm256_add_ps(velecsum,velec);
1098 fscal = _mm256_and_ps(fscal,cutoff_mask);
1100 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1102 /* Calculate temporary vectorial force */
1103 tx = _mm256_mul_ps(fscal,dx12);
1104 ty = _mm256_mul_ps(fscal,dy12);
1105 tz = _mm256_mul_ps(fscal,dz12);
1107 /* Update vectorial force */
1108 fix1 = _mm256_add_ps(fix1,tx);
1109 fiy1 = _mm256_add_ps(fiy1,ty);
1110 fiz1 = _mm256_add_ps(fiz1,tz);
1112 fjx2 = _mm256_add_ps(fjx2,tx);
1113 fjy2 = _mm256_add_ps(fjy2,ty);
1114 fjz2 = _mm256_add_ps(fjz2,tz);
1118 /**************************
1119 * CALCULATE INTERACTIONS *
1120 **************************/
1122 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1125 /* REACTION-FIELD ELECTROSTATICS */
1126 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
1127 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1129 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1131 /* Update potential sum for this i atom from the interaction with this j atom. */
1132 velec = _mm256_and_ps(velec,cutoff_mask);
1133 velec = _mm256_andnot_ps(dummy_mask,velec);
1134 velecsum = _mm256_add_ps(velecsum,velec);
1138 fscal = _mm256_and_ps(fscal,cutoff_mask);
1140 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1142 /* Calculate temporary vectorial force */
1143 tx = _mm256_mul_ps(fscal,dx20);
1144 ty = _mm256_mul_ps(fscal,dy20);
1145 tz = _mm256_mul_ps(fscal,dz20);
1147 /* Update vectorial force */
1148 fix2 = _mm256_add_ps(fix2,tx);
1149 fiy2 = _mm256_add_ps(fiy2,ty);
1150 fiz2 = _mm256_add_ps(fiz2,tz);
1152 fjx0 = _mm256_add_ps(fjx0,tx);
1153 fjy0 = _mm256_add_ps(fjy0,ty);
1154 fjz0 = _mm256_add_ps(fjz0,tz);
1158 /**************************
1159 * CALCULATE INTERACTIONS *
1160 **************************/
1162 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1165 /* REACTION-FIELD ELECTROSTATICS */
1166 velec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_add_ps(rinv21,_mm256_mul_ps(krf,rsq21)),crf));
1167 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1169 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1171 /* Update potential sum for this i atom from the interaction with this j atom. */
1172 velec = _mm256_and_ps(velec,cutoff_mask);
1173 velec = _mm256_andnot_ps(dummy_mask,velec);
1174 velecsum = _mm256_add_ps(velecsum,velec);
1178 fscal = _mm256_and_ps(fscal,cutoff_mask);
1180 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1182 /* Calculate temporary vectorial force */
1183 tx = _mm256_mul_ps(fscal,dx21);
1184 ty = _mm256_mul_ps(fscal,dy21);
1185 tz = _mm256_mul_ps(fscal,dz21);
1187 /* Update vectorial force */
1188 fix2 = _mm256_add_ps(fix2,tx);
1189 fiy2 = _mm256_add_ps(fiy2,ty);
1190 fiz2 = _mm256_add_ps(fiz2,tz);
1192 fjx1 = _mm256_add_ps(fjx1,tx);
1193 fjy1 = _mm256_add_ps(fjy1,ty);
1194 fjz1 = _mm256_add_ps(fjz1,tz);
1198 /**************************
1199 * CALCULATE INTERACTIONS *
1200 **************************/
1202 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1205 /* REACTION-FIELD ELECTROSTATICS */
1206 velec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_add_ps(rinv22,_mm256_mul_ps(krf,rsq22)),crf));
1207 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1209 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1211 /* Update potential sum for this i atom from the interaction with this j atom. */
1212 velec = _mm256_and_ps(velec,cutoff_mask);
1213 velec = _mm256_andnot_ps(dummy_mask,velec);
1214 velecsum = _mm256_add_ps(velecsum,velec);
1218 fscal = _mm256_and_ps(fscal,cutoff_mask);
1220 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1222 /* Calculate temporary vectorial force */
1223 tx = _mm256_mul_ps(fscal,dx22);
1224 ty = _mm256_mul_ps(fscal,dy22);
1225 tz = _mm256_mul_ps(fscal,dz22);
1227 /* Update vectorial force */
1228 fix2 = _mm256_add_ps(fix2,tx);
1229 fiy2 = _mm256_add_ps(fiy2,ty);
1230 fiz2 = _mm256_add_ps(fiz2,tz);
1232 fjx2 = _mm256_add_ps(fjx2,tx);
1233 fjy2 = _mm256_add_ps(fjy2,ty);
1234 fjz2 = _mm256_add_ps(fjz2,tz);
1238 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1239 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1240 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1241 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1242 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1243 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1244 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1245 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1247 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1248 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1250 /* Inner loop uses 361 flops */
1253 /* End of innermost loop */
1255 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1256 f+i_coord_offset,fshift+i_shift_offset);
1259 /* Update potential energies */
1260 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
1261 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
1263 /* Increment number of inner iterations */
1264 inneriter += j_index_end - j_index_start;
1266 /* Outer loop uses 20 flops */
1269 /* Increment number of outer iterations */
1272 /* Update outer/inner flops */
1274 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*361);
1277 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_single
1278 * Electrostatics interaction: ReactionField
1279 * VdW interaction: CubicSplineTable
1280 * Geometry: Water3-Water3
1281 * Calculate force/pot: Force
1284 nb_kernel_ElecRFCut_VdwCSTab_GeomW3W3_F_avx_256_single
1285 (t_nblist * gmx_restrict nlist,
1286 rvec * gmx_restrict xx,
1287 rvec * gmx_restrict ff,
1288 t_forcerec * gmx_restrict fr,
1289 t_mdatoms * gmx_restrict mdatoms,
1290 nb_kernel_data_t * gmx_restrict kernel_data,
1291 t_nrnb * gmx_restrict nrnb)
1293 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1294 * just 0 for non-waters.
1295 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
1296 * jnr indices corresponding to data put in the eight positions in the SIMD register.
1298 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1299 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1300 int jnrA,jnrB,jnrC,jnrD;
1301 int jnrE,jnrF,jnrG,jnrH;
1302 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1303 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1304 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1305 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
1306 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1307 real rcutoff_scalar;
1308 real *shiftvec,*fshift,*x,*f;
1309 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
1310 real scratch[4*DIM];
1311 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1312 real * vdwioffsetptr0;
1313 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1314 real * vdwioffsetptr1;
1315 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1316 real * vdwioffsetptr2;
1317 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1318 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
1319 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1320 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D,vdwjidx1E,vdwjidx1F,vdwjidx1G,vdwjidx1H;
1321 __m256 jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1322 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D,vdwjidx2E,vdwjidx2F,vdwjidx2G,vdwjidx2H;
1323 __m256 jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1324 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1325 __m256 dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1326 __m256 dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1327 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1328 __m256 dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1329 __m256 dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1330 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1331 __m256 dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1332 __m256 dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1333 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
1336 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1339 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
1340 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
1342 __m128i vfitab_lo,vfitab_hi;
1343 __m128i ifour = _mm_set1_epi32(4);
1344 __m256 rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
1346 __m256 dummy_mask,cutoff_mask;
1347 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
1348 __m256 one = _mm256_set1_ps(1.0);
1349 __m256 two = _mm256_set1_ps(2.0);
1355 jindex = nlist->jindex;
1357 shiftidx = nlist->shift;
1359 shiftvec = fr->shift_vec[0];
1360 fshift = fr->fshift[0];
1361 facel = _mm256_set1_ps(fr->epsfac);
1362 charge = mdatoms->chargeA;
1363 krf = _mm256_set1_ps(fr->ic->k_rf);
1364 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
1365 crf = _mm256_set1_ps(fr->ic->c_rf);
1366 nvdwtype = fr->ntype;
1367 vdwparam = fr->nbfp;
1368 vdwtype = mdatoms->typeA;
1370 vftab = kernel_data->table_vdw->data;
1371 vftabscale = _mm256_set1_ps(kernel_data->table_vdw->scale);
1373 /* Setup water-specific parameters */
1374 inr = nlist->iinr[0];
1375 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
1376 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
1377 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
1378 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1380 jq0 = _mm256_set1_ps(charge[inr+0]);
1381 jq1 = _mm256_set1_ps(charge[inr+1]);
1382 jq2 = _mm256_set1_ps(charge[inr+2]);
1383 vdwjidx0A = 2*vdwtype[inr+0];
1384 qq00 = _mm256_mul_ps(iq0,jq0);
1385 c6_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A]);
1386 c12_00 = _mm256_set1_ps(vdwioffsetptr0[vdwjidx0A+1]);
1387 qq01 = _mm256_mul_ps(iq0,jq1);
1388 qq02 = _mm256_mul_ps(iq0,jq2);
1389 qq10 = _mm256_mul_ps(iq1,jq0);
1390 qq11 = _mm256_mul_ps(iq1,jq1);
1391 qq12 = _mm256_mul_ps(iq1,jq2);
1392 qq20 = _mm256_mul_ps(iq2,jq0);
1393 qq21 = _mm256_mul_ps(iq2,jq1);
1394 qq22 = _mm256_mul_ps(iq2,jq2);
1396 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
1397 rcutoff_scalar = fr->rcoulomb;
1398 rcutoff = _mm256_set1_ps(rcutoff_scalar);
1399 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
1401 /* Avoid stupid compiler warnings */
1402 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
1403 j_coord_offsetA = 0;
1404 j_coord_offsetB = 0;
1405 j_coord_offsetC = 0;
1406 j_coord_offsetD = 0;
1407 j_coord_offsetE = 0;
1408 j_coord_offsetF = 0;
1409 j_coord_offsetG = 0;
1410 j_coord_offsetH = 0;
1415 for(iidx=0;iidx<4*DIM;iidx++)
1417 scratch[iidx] = 0.0;
1420 /* Start outer loop over neighborlists */
1421 for(iidx=0; iidx<nri; iidx++)
1423 /* Load shift vector for this list */
1424 i_shift_offset = DIM*shiftidx[iidx];
1426 /* Load limits for loop over neighbors */
1427 j_index_start = jindex[iidx];
1428 j_index_end = jindex[iidx+1];
1430 /* Get outer coordinate index */
1432 i_coord_offset = DIM*inr;
1434 /* Load i particle coords and add shift vector */
1435 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
1436 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1438 fix0 = _mm256_setzero_ps();
1439 fiy0 = _mm256_setzero_ps();
1440 fiz0 = _mm256_setzero_ps();
1441 fix1 = _mm256_setzero_ps();
1442 fiy1 = _mm256_setzero_ps();
1443 fiz1 = _mm256_setzero_ps();
1444 fix2 = _mm256_setzero_ps();
1445 fiy2 = _mm256_setzero_ps();
1446 fiz2 = _mm256_setzero_ps();
1448 /* Start inner kernel loop */
1449 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
1452 /* Get j neighbor index, and coordinate index */
1454 jnrB = jjnr[jidx+1];
1455 jnrC = jjnr[jidx+2];
1456 jnrD = jjnr[jidx+3];
1457 jnrE = jjnr[jidx+4];
1458 jnrF = jjnr[jidx+5];
1459 jnrG = jjnr[jidx+6];
1460 jnrH = jjnr[jidx+7];
1461 j_coord_offsetA = DIM*jnrA;
1462 j_coord_offsetB = DIM*jnrB;
1463 j_coord_offsetC = DIM*jnrC;
1464 j_coord_offsetD = DIM*jnrD;
1465 j_coord_offsetE = DIM*jnrE;
1466 j_coord_offsetF = DIM*jnrF;
1467 j_coord_offsetG = DIM*jnrG;
1468 j_coord_offsetH = DIM*jnrH;
1470 /* load j atom coordinates */
1471 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1472 x+j_coord_offsetC,x+j_coord_offsetD,
1473 x+j_coord_offsetE,x+j_coord_offsetF,
1474 x+j_coord_offsetG,x+j_coord_offsetH,
1475 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1477 /* Calculate displacement vector */
1478 dx00 = _mm256_sub_ps(ix0,jx0);
1479 dy00 = _mm256_sub_ps(iy0,jy0);
1480 dz00 = _mm256_sub_ps(iz0,jz0);
1481 dx01 = _mm256_sub_ps(ix0,jx1);
1482 dy01 = _mm256_sub_ps(iy0,jy1);
1483 dz01 = _mm256_sub_ps(iz0,jz1);
1484 dx02 = _mm256_sub_ps(ix0,jx2);
1485 dy02 = _mm256_sub_ps(iy0,jy2);
1486 dz02 = _mm256_sub_ps(iz0,jz2);
1487 dx10 = _mm256_sub_ps(ix1,jx0);
1488 dy10 = _mm256_sub_ps(iy1,jy0);
1489 dz10 = _mm256_sub_ps(iz1,jz0);
1490 dx11 = _mm256_sub_ps(ix1,jx1);
1491 dy11 = _mm256_sub_ps(iy1,jy1);
1492 dz11 = _mm256_sub_ps(iz1,jz1);
1493 dx12 = _mm256_sub_ps(ix1,jx2);
1494 dy12 = _mm256_sub_ps(iy1,jy2);
1495 dz12 = _mm256_sub_ps(iz1,jz2);
1496 dx20 = _mm256_sub_ps(ix2,jx0);
1497 dy20 = _mm256_sub_ps(iy2,jy0);
1498 dz20 = _mm256_sub_ps(iz2,jz0);
1499 dx21 = _mm256_sub_ps(ix2,jx1);
1500 dy21 = _mm256_sub_ps(iy2,jy1);
1501 dz21 = _mm256_sub_ps(iz2,jz1);
1502 dx22 = _mm256_sub_ps(ix2,jx2);
1503 dy22 = _mm256_sub_ps(iy2,jy2);
1504 dz22 = _mm256_sub_ps(iz2,jz2);
1506 /* Calculate squared distance and things based on it */
1507 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1508 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1509 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1510 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1511 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1512 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1513 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1514 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1515 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1517 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1518 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1519 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1520 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1521 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1522 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1523 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1524 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1525 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1527 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1528 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1529 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1530 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1531 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1532 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1533 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1534 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1535 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1537 fjx0 = _mm256_setzero_ps();
1538 fjy0 = _mm256_setzero_ps();
1539 fjz0 = _mm256_setzero_ps();
1540 fjx1 = _mm256_setzero_ps();
1541 fjy1 = _mm256_setzero_ps();
1542 fjz1 = _mm256_setzero_ps();
1543 fjx2 = _mm256_setzero_ps();
1544 fjy2 = _mm256_setzero_ps();
1545 fjz2 = _mm256_setzero_ps();
1547 /**************************
1548 * CALCULATE INTERACTIONS *
1549 **************************/
1551 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1554 r00 = _mm256_mul_ps(rsq00,rinv00);
1556 /* Calculate table index by multiplying r by the table scale and truncating to integer */
1557 rt = _mm256_mul_ps(r00,vftabscale);
1558 vfitab = _mm256_cvttps_epi32(rt);
1559 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
1560 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
1561 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
1562 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
1563 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
1564 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
1566 /* REACTION-FIELD ELECTROSTATICS */
1567 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1569 /* CUBIC SPLINE TABLE DISPERSION */
1570 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1571 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1572 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1573 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1574 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1575 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1576 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1577 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1578 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1579 Heps = _mm256_mul_ps(vfeps,H);
1580 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1581 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1582 fvdw6 = _mm256_mul_ps(c6_00,FF);
1584 /* CUBIC SPLINE TABLE REPULSION */
1585 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
1586 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
1587 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
1588 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
1589 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
1590 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
1591 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
1592 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
1593 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
1594 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
1595 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
1596 Heps = _mm256_mul_ps(vfeps,H);
1597 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
1598 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
1599 fvdw12 = _mm256_mul_ps(c12_00,FF);
1600 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
1602 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1604 fscal = _mm256_add_ps(felec,fvdw);
1606 fscal = _mm256_and_ps(fscal,cutoff_mask);
1608 /* Calculate temporary vectorial force */
1609 tx = _mm256_mul_ps(fscal,dx00);
1610 ty = _mm256_mul_ps(fscal,dy00);
1611 tz = _mm256_mul_ps(fscal,dz00);
1613 /* Update vectorial force */
1614 fix0 = _mm256_add_ps(fix0,tx);
1615 fiy0 = _mm256_add_ps(fiy0,ty);
1616 fiz0 = _mm256_add_ps(fiz0,tz);
1618 fjx0 = _mm256_add_ps(fjx0,tx);
1619 fjy0 = _mm256_add_ps(fjy0,ty);
1620 fjz0 = _mm256_add_ps(fjz0,tz);
1624 /**************************
1625 * CALCULATE INTERACTIONS *
1626 **************************/
1628 if (gmx_mm256_any_lt(rsq01,rcutoff2))
1631 /* REACTION-FIELD ELECTROSTATICS */
1632 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
1634 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
1638 fscal = _mm256_and_ps(fscal,cutoff_mask);
1640 /* Calculate temporary vectorial force */
1641 tx = _mm256_mul_ps(fscal,dx01);
1642 ty = _mm256_mul_ps(fscal,dy01);
1643 tz = _mm256_mul_ps(fscal,dz01);
1645 /* Update vectorial force */
1646 fix0 = _mm256_add_ps(fix0,tx);
1647 fiy0 = _mm256_add_ps(fiy0,ty);
1648 fiz0 = _mm256_add_ps(fiz0,tz);
1650 fjx1 = _mm256_add_ps(fjx1,tx);
1651 fjy1 = _mm256_add_ps(fjy1,ty);
1652 fjz1 = _mm256_add_ps(fjz1,tz);
1656 /**************************
1657 * CALCULATE INTERACTIONS *
1658 **************************/
1660 if (gmx_mm256_any_lt(rsq02,rcutoff2))
1663 /* REACTION-FIELD ELECTROSTATICS */
1664 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
1666 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
1670 fscal = _mm256_and_ps(fscal,cutoff_mask);
1672 /* Calculate temporary vectorial force */
1673 tx = _mm256_mul_ps(fscal,dx02);
1674 ty = _mm256_mul_ps(fscal,dy02);
1675 tz = _mm256_mul_ps(fscal,dz02);
1677 /* Update vectorial force */
1678 fix0 = _mm256_add_ps(fix0,tx);
1679 fiy0 = _mm256_add_ps(fiy0,ty);
1680 fiz0 = _mm256_add_ps(fiz0,tz);
1682 fjx2 = _mm256_add_ps(fjx2,tx);
1683 fjy2 = _mm256_add_ps(fjy2,ty);
1684 fjz2 = _mm256_add_ps(fjz2,tz);
1688 /**************************
1689 * CALCULATE INTERACTIONS *
1690 **************************/
1692 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1695 /* REACTION-FIELD ELECTROSTATICS */
1696 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1698 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1702 fscal = _mm256_and_ps(fscal,cutoff_mask);
1704 /* Calculate temporary vectorial force */
1705 tx = _mm256_mul_ps(fscal,dx10);
1706 ty = _mm256_mul_ps(fscal,dy10);
1707 tz = _mm256_mul_ps(fscal,dz10);
1709 /* Update vectorial force */
1710 fix1 = _mm256_add_ps(fix1,tx);
1711 fiy1 = _mm256_add_ps(fiy1,ty);
1712 fiz1 = _mm256_add_ps(fiz1,tz);
1714 fjx0 = _mm256_add_ps(fjx0,tx);
1715 fjy0 = _mm256_add_ps(fjy0,ty);
1716 fjz0 = _mm256_add_ps(fjz0,tz);
1720 /**************************
1721 * CALCULATE INTERACTIONS *
1722 **************************/
1724 if (gmx_mm256_any_lt(rsq11,rcutoff2))
1727 /* REACTION-FIELD ELECTROSTATICS */
1728 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
1730 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
1734 fscal = _mm256_and_ps(fscal,cutoff_mask);
1736 /* Calculate temporary vectorial force */
1737 tx = _mm256_mul_ps(fscal,dx11);
1738 ty = _mm256_mul_ps(fscal,dy11);
1739 tz = _mm256_mul_ps(fscal,dz11);
1741 /* Update vectorial force */
1742 fix1 = _mm256_add_ps(fix1,tx);
1743 fiy1 = _mm256_add_ps(fiy1,ty);
1744 fiz1 = _mm256_add_ps(fiz1,tz);
1746 fjx1 = _mm256_add_ps(fjx1,tx);
1747 fjy1 = _mm256_add_ps(fjy1,ty);
1748 fjz1 = _mm256_add_ps(fjz1,tz);
1752 /**************************
1753 * CALCULATE INTERACTIONS *
1754 **************************/
1756 if (gmx_mm256_any_lt(rsq12,rcutoff2))
1759 /* REACTION-FIELD ELECTROSTATICS */
1760 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
1762 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
1766 fscal = _mm256_and_ps(fscal,cutoff_mask);
1768 /* Calculate temporary vectorial force */
1769 tx = _mm256_mul_ps(fscal,dx12);
1770 ty = _mm256_mul_ps(fscal,dy12);
1771 tz = _mm256_mul_ps(fscal,dz12);
1773 /* Update vectorial force */
1774 fix1 = _mm256_add_ps(fix1,tx);
1775 fiy1 = _mm256_add_ps(fiy1,ty);
1776 fiz1 = _mm256_add_ps(fiz1,tz);
1778 fjx2 = _mm256_add_ps(fjx2,tx);
1779 fjy2 = _mm256_add_ps(fjy2,ty);
1780 fjz2 = _mm256_add_ps(fjz2,tz);
1784 /**************************
1785 * CALCULATE INTERACTIONS *
1786 **************************/
1788 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1791 /* REACTION-FIELD ELECTROSTATICS */
1792 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1794 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1798 fscal = _mm256_and_ps(fscal,cutoff_mask);
1800 /* Calculate temporary vectorial force */
1801 tx = _mm256_mul_ps(fscal,dx20);
1802 ty = _mm256_mul_ps(fscal,dy20);
1803 tz = _mm256_mul_ps(fscal,dz20);
1805 /* Update vectorial force */
1806 fix2 = _mm256_add_ps(fix2,tx);
1807 fiy2 = _mm256_add_ps(fiy2,ty);
1808 fiz2 = _mm256_add_ps(fiz2,tz);
1810 fjx0 = _mm256_add_ps(fjx0,tx);
1811 fjy0 = _mm256_add_ps(fjy0,ty);
1812 fjz0 = _mm256_add_ps(fjz0,tz);
1816 /**************************
1817 * CALCULATE INTERACTIONS *
1818 **************************/
1820 if (gmx_mm256_any_lt(rsq21,rcutoff2))
1823 /* REACTION-FIELD ELECTROSTATICS */
1824 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
1826 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
1830 fscal = _mm256_and_ps(fscal,cutoff_mask);
1832 /* Calculate temporary vectorial force */
1833 tx = _mm256_mul_ps(fscal,dx21);
1834 ty = _mm256_mul_ps(fscal,dy21);
1835 tz = _mm256_mul_ps(fscal,dz21);
1837 /* Update vectorial force */
1838 fix2 = _mm256_add_ps(fix2,tx);
1839 fiy2 = _mm256_add_ps(fiy2,ty);
1840 fiz2 = _mm256_add_ps(fiz2,tz);
1842 fjx1 = _mm256_add_ps(fjx1,tx);
1843 fjy1 = _mm256_add_ps(fjy1,ty);
1844 fjz1 = _mm256_add_ps(fjz1,tz);
1848 /**************************
1849 * CALCULATE INTERACTIONS *
1850 **************************/
1852 if (gmx_mm256_any_lt(rsq22,rcutoff2))
1855 /* REACTION-FIELD ELECTROSTATICS */
1856 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
1858 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
1862 fscal = _mm256_and_ps(fscal,cutoff_mask);
1864 /* Calculate temporary vectorial force */
1865 tx = _mm256_mul_ps(fscal,dx22);
1866 ty = _mm256_mul_ps(fscal,dy22);
1867 tz = _mm256_mul_ps(fscal,dz22);
1869 /* Update vectorial force */
1870 fix2 = _mm256_add_ps(fix2,tx);
1871 fiy2 = _mm256_add_ps(fiy2,ty);
1872 fiz2 = _mm256_add_ps(fiz2,tz);
1874 fjx2 = _mm256_add_ps(fjx2,tx);
1875 fjy2 = _mm256_add_ps(fjy2,ty);
1876 fjz2 = _mm256_add_ps(fjz2,tz);
1880 fjptrA = f+j_coord_offsetA;
1881 fjptrB = f+j_coord_offsetB;
1882 fjptrC = f+j_coord_offsetC;
1883 fjptrD = f+j_coord_offsetD;
1884 fjptrE = f+j_coord_offsetE;
1885 fjptrF = f+j_coord_offsetF;
1886 fjptrG = f+j_coord_offsetG;
1887 fjptrH = f+j_coord_offsetH;
1889 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
1890 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1892 /* Inner loop uses 297 flops */
1895 if(jidx<j_index_end)
1898 /* Get j neighbor index, and coordinate index */
1899 jnrlistA = jjnr[jidx];
1900 jnrlistB = jjnr[jidx+1];
1901 jnrlistC = jjnr[jidx+2];
1902 jnrlistD = jjnr[jidx+3];
1903 jnrlistE = jjnr[jidx+4];
1904 jnrlistF = jjnr[jidx+5];
1905 jnrlistG = jjnr[jidx+6];
1906 jnrlistH = jjnr[jidx+7];
1907 /* Sign of each element will be negative for non-real atoms.
1908 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1909 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1911 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1912 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1914 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1915 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1916 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1917 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1918 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1919 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1920 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1921 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1922 j_coord_offsetA = DIM*jnrA;
1923 j_coord_offsetB = DIM*jnrB;
1924 j_coord_offsetC = DIM*jnrC;
1925 j_coord_offsetD = DIM*jnrD;
1926 j_coord_offsetE = DIM*jnrE;
1927 j_coord_offsetF = DIM*jnrF;
1928 j_coord_offsetG = DIM*jnrG;
1929 j_coord_offsetH = DIM*jnrH;
1931 /* load j atom coordinates */
1932 gmx_mm256_load_3rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1933 x+j_coord_offsetC,x+j_coord_offsetD,
1934 x+j_coord_offsetE,x+j_coord_offsetF,
1935 x+j_coord_offsetG,x+j_coord_offsetH,
1936 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1938 /* Calculate displacement vector */
1939 dx00 = _mm256_sub_ps(ix0,jx0);
1940 dy00 = _mm256_sub_ps(iy0,jy0);
1941 dz00 = _mm256_sub_ps(iz0,jz0);
1942 dx01 = _mm256_sub_ps(ix0,jx1);
1943 dy01 = _mm256_sub_ps(iy0,jy1);
1944 dz01 = _mm256_sub_ps(iz0,jz1);
1945 dx02 = _mm256_sub_ps(ix0,jx2);
1946 dy02 = _mm256_sub_ps(iy0,jy2);
1947 dz02 = _mm256_sub_ps(iz0,jz2);
1948 dx10 = _mm256_sub_ps(ix1,jx0);
1949 dy10 = _mm256_sub_ps(iy1,jy0);
1950 dz10 = _mm256_sub_ps(iz1,jz0);
1951 dx11 = _mm256_sub_ps(ix1,jx1);
1952 dy11 = _mm256_sub_ps(iy1,jy1);
1953 dz11 = _mm256_sub_ps(iz1,jz1);
1954 dx12 = _mm256_sub_ps(ix1,jx2);
1955 dy12 = _mm256_sub_ps(iy1,jy2);
1956 dz12 = _mm256_sub_ps(iz1,jz2);
1957 dx20 = _mm256_sub_ps(ix2,jx0);
1958 dy20 = _mm256_sub_ps(iy2,jy0);
1959 dz20 = _mm256_sub_ps(iz2,jz0);
1960 dx21 = _mm256_sub_ps(ix2,jx1);
1961 dy21 = _mm256_sub_ps(iy2,jy1);
1962 dz21 = _mm256_sub_ps(iz2,jz1);
1963 dx22 = _mm256_sub_ps(ix2,jx2);
1964 dy22 = _mm256_sub_ps(iy2,jy2);
1965 dz22 = _mm256_sub_ps(iz2,jz2);
1967 /* Calculate squared distance and things based on it */
1968 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1969 rsq01 = gmx_mm256_calc_rsq_ps(dx01,dy01,dz01);
1970 rsq02 = gmx_mm256_calc_rsq_ps(dx02,dy02,dz02);
1971 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1972 rsq11 = gmx_mm256_calc_rsq_ps(dx11,dy11,dz11);
1973 rsq12 = gmx_mm256_calc_rsq_ps(dx12,dy12,dz12);
1974 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1975 rsq21 = gmx_mm256_calc_rsq_ps(dx21,dy21,dz21);
1976 rsq22 = gmx_mm256_calc_rsq_ps(dx22,dy22,dz22);
1978 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1979 rinv01 = gmx_mm256_invsqrt_ps(rsq01);
1980 rinv02 = gmx_mm256_invsqrt_ps(rsq02);
1981 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1982 rinv11 = gmx_mm256_invsqrt_ps(rsq11);
1983 rinv12 = gmx_mm256_invsqrt_ps(rsq12);
1984 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1985 rinv21 = gmx_mm256_invsqrt_ps(rsq21);
1986 rinv22 = gmx_mm256_invsqrt_ps(rsq22);
1988 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1989 rinvsq01 = _mm256_mul_ps(rinv01,rinv01);
1990 rinvsq02 = _mm256_mul_ps(rinv02,rinv02);
1991 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1992 rinvsq11 = _mm256_mul_ps(rinv11,rinv11);
1993 rinvsq12 = _mm256_mul_ps(rinv12,rinv12);
1994 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1995 rinvsq21 = _mm256_mul_ps(rinv21,rinv21);
1996 rinvsq22 = _mm256_mul_ps(rinv22,rinv22);
1998 fjx0 = _mm256_setzero_ps();
1999 fjy0 = _mm256_setzero_ps();
2000 fjz0 = _mm256_setzero_ps();
2001 fjx1 = _mm256_setzero_ps();
2002 fjy1 = _mm256_setzero_ps();
2003 fjz1 = _mm256_setzero_ps();
2004 fjx2 = _mm256_setzero_ps();
2005 fjy2 = _mm256_setzero_ps();
2006 fjz2 = _mm256_setzero_ps();
2008 /**************************
2009 * CALCULATE INTERACTIONS *
2010 **************************/
2012 if (gmx_mm256_any_lt(rsq00,rcutoff2))
2015 r00 = _mm256_mul_ps(rsq00,rinv00);
2016 r00 = _mm256_andnot_ps(dummy_mask,r00);
2018 /* Calculate table index by multiplying r with table scale and truncate to integer */
2019 rt = _mm256_mul_ps(r00,vftabscale);
2020 vfitab = _mm256_cvttps_epi32(rt);
2021 vfeps = _mm256_sub_ps(rt,_mm256_round_ps(rt, _MM_FROUND_FLOOR));
2022 /* AVX1 does not support 256-bit integer operations, so now we go to 128-bit mode... */
2023 vfitab_lo = _mm256_extractf128_si256(vfitab,0x0);
2024 vfitab_hi = _mm256_extractf128_si256(vfitab,0x1);
2025 vfitab_lo = _mm_slli_epi32(vfitab_lo,3);
2026 vfitab_hi = _mm_slli_epi32(vfitab_hi,3);
2028 /* REACTION-FIELD ELECTROSTATICS */
2029 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
2031 /* CUBIC SPLINE TABLE DISPERSION */
2032 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2033 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2034 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2035 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2036 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2037 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2038 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2039 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2040 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2041 Heps = _mm256_mul_ps(vfeps,H);
2042 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2043 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2044 fvdw6 = _mm256_mul_ps(c6_00,FF);
2046 /* CUBIC SPLINE TABLE REPULSION */
2047 vfitab_lo = _mm_add_epi32(vfitab_lo,ifour);
2048 vfitab_hi = _mm_add_epi32(vfitab_hi,ifour);
2049 Y = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,0)),
2050 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,0)));
2051 F = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,1)),
2052 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,1)));
2053 G = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,2)),
2054 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,2)));
2055 H = gmx_mm256_set_m128(_mm_load_ps(vftab + _mm_extract_epi32(vfitab_hi,3)),
2056 _mm_load_ps(vftab + _mm_extract_epi32(vfitab_lo,3)));
2057 GMX_MM256_HALFTRANSPOSE4_PS(Y,F,G,H);
2058 Heps = _mm256_mul_ps(vfeps,H);
2059 Fp = _mm256_add_ps(F,_mm256_mul_ps(vfeps,_mm256_add_ps(G,Heps)));
2060 FF = _mm256_add_ps(Fp,_mm256_mul_ps(vfeps,_mm256_add_ps(G,_mm256_add_ps(Heps,Heps))));
2061 fvdw12 = _mm256_mul_ps(c12_00,FF);
2062 fvdw = _mm256_xor_ps(signbit,_mm256_mul_ps(_mm256_add_ps(fvdw6,fvdw12),_mm256_mul_ps(vftabscale,rinv00)));
2064 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
2066 fscal = _mm256_add_ps(felec,fvdw);
2068 fscal = _mm256_and_ps(fscal,cutoff_mask);
2070 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2072 /* Calculate temporary vectorial force */
2073 tx = _mm256_mul_ps(fscal,dx00);
2074 ty = _mm256_mul_ps(fscal,dy00);
2075 tz = _mm256_mul_ps(fscal,dz00);
2077 /* Update vectorial force */
2078 fix0 = _mm256_add_ps(fix0,tx);
2079 fiy0 = _mm256_add_ps(fiy0,ty);
2080 fiz0 = _mm256_add_ps(fiz0,tz);
2082 fjx0 = _mm256_add_ps(fjx0,tx);
2083 fjy0 = _mm256_add_ps(fjy0,ty);
2084 fjz0 = _mm256_add_ps(fjz0,tz);
2088 /**************************
2089 * CALCULATE INTERACTIONS *
2090 **************************/
2092 if (gmx_mm256_any_lt(rsq01,rcutoff2))
2095 /* REACTION-FIELD ELECTROSTATICS */
2096 felec = _mm256_mul_ps(qq01,_mm256_sub_ps(_mm256_mul_ps(rinv01,rinvsq01),krf2));
2098 cutoff_mask = _mm256_cmp_ps(rsq01,rcutoff2,_CMP_LT_OQ);
2102 fscal = _mm256_and_ps(fscal,cutoff_mask);
2104 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2106 /* Calculate temporary vectorial force */
2107 tx = _mm256_mul_ps(fscal,dx01);
2108 ty = _mm256_mul_ps(fscal,dy01);
2109 tz = _mm256_mul_ps(fscal,dz01);
2111 /* Update vectorial force */
2112 fix0 = _mm256_add_ps(fix0,tx);
2113 fiy0 = _mm256_add_ps(fiy0,ty);
2114 fiz0 = _mm256_add_ps(fiz0,tz);
2116 fjx1 = _mm256_add_ps(fjx1,tx);
2117 fjy1 = _mm256_add_ps(fjy1,ty);
2118 fjz1 = _mm256_add_ps(fjz1,tz);
2122 /**************************
2123 * CALCULATE INTERACTIONS *
2124 **************************/
2126 if (gmx_mm256_any_lt(rsq02,rcutoff2))
2129 /* REACTION-FIELD ELECTROSTATICS */
2130 felec = _mm256_mul_ps(qq02,_mm256_sub_ps(_mm256_mul_ps(rinv02,rinvsq02),krf2));
2132 cutoff_mask = _mm256_cmp_ps(rsq02,rcutoff2,_CMP_LT_OQ);
2136 fscal = _mm256_and_ps(fscal,cutoff_mask);
2138 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2140 /* Calculate temporary vectorial force */
2141 tx = _mm256_mul_ps(fscal,dx02);
2142 ty = _mm256_mul_ps(fscal,dy02);
2143 tz = _mm256_mul_ps(fscal,dz02);
2145 /* Update vectorial force */
2146 fix0 = _mm256_add_ps(fix0,tx);
2147 fiy0 = _mm256_add_ps(fiy0,ty);
2148 fiz0 = _mm256_add_ps(fiz0,tz);
2150 fjx2 = _mm256_add_ps(fjx2,tx);
2151 fjy2 = _mm256_add_ps(fjy2,ty);
2152 fjz2 = _mm256_add_ps(fjz2,tz);
2156 /**************************
2157 * CALCULATE INTERACTIONS *
2158 **************************/
2160 if (gmx_mm256_any_lt(rsq10,rcutoff2))
2163 /* REACTION-FIELD ELECTROSTATICS */
2164 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
2166 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
2170 fscal = _mm256_and_ps(fscal,cutoff_mask);
2172 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2174 /* Calculate temporary vectorial force */
2175 tx = _mm256_mul_ps(fscal,dx10);
2176 ty = _mm256_mul_ps(fscal,dy10);
2177 tz = _mm256_mul_ps(fscal,dz10);
2179 /* Update vectorial force */
2180 fix1 = _mm256_add_ps(fix1,tx);
2181 fiy1 = _mm256_add_ps(fiy1,ty);
2182 fiz1 = _mm256_add_ps(fiz1,tz);
2184 fjx0 = _mm256_add_ps(fjx0,tx);
2185 fjy0 = _mm256_add_ps(fjy0,ty);
2186 fjz0 = _mm256_add_ps(fjz0,tz);
2190 /**************************
2191 * CALCULATE INTERACTIONS *
2192 **************************/
2194 if (gmx_mm256_any_lt(rsq11,rcutoff2))
2197 /* REACTION-FIELD ELECTROSTATICS */
2198 felec = _mm256_mul_ps(qq11,_mm256_sub_ps(_mm256_mul_ps(rinv11,rinvsq11),krf2));
2200 cutoff_mask = _mm256_cmp_ps(rsq11,rcutoff2,_CMP_LT_OQ);
2204 fscal = _mm256_and_ps(fscal,cutoff_mask);
2206 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2208 /* Calculate temporary vectorial force */
2209 tx = _mm256_mul_ps(fscal,dx11);
2210 ty = _mm256_mul_ps(fscal,dy11);
2211 tz = _mm256_mul_ps(fscal,dz11);
2213 /* Update vectorial force */
2214 fix1 = _mm256_add_ps(fix1,tx);
2215 fiy1 = _mm256_add_ps(fiy1,ty);
2216 fiz1 = _mm256_add_ps(fiz1,tz);
2218 fjx1 = _mm256_add_ps(fjx1,tx);
2219 fjy1 = _mm256_add_ps(fjy1,ty);
2220 fjz1 = _mm256_add_ps(fjz1,tz);
2224 /**************************
2225 * CALCULATE INTERACTIONS *
2226 **************************/
2228 if (gmx_mm256_any_lt(rsq12,rcutoff2))
2231 /* REACTION-FIELD ELECTROSTATICS */
2232 felec = _mm256_mul_ps(qq12,_mm256_sub_ps(_mm256_mul_ps(rinv12,rinvsq12),krf2));
2234 cutoff_mask = _mm256_cmp_ps(rsq12,rcutoff2,_CMP_LT_OQ);
2238 fscal = _mm256_and_ps(fscal,cutoff_mask);
2240 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2242 /* Calculate temporary vectorial force */
2243 tx = _mm256_mul_ps(fscal,dx12);
2244 ty = _mm256_mul_ps(fscal,dy12);
2245 tz = _mm256_mul_ps(fscal,dz12);
2247 /* Update vectorial force */
2248 fix1 = _mm256_add_ps(fix1,tx);
2249 fiy1 = _mm256_add_ps(fiy1,ty);
2250 fiz1 = _mm256_add_ps(fiz1,tz);
2252 fjx2 = _mm256_add_ps(fjx2,tx);
2253 fjy2 = _mm256_add_ps(fjy2,ty);
2254 fjz2 = _mm256_add_ps(fjz2,tz);
2258 /**************************
2259 * CALCULATE INTERACTIONS *
2260 **************************/
2262 if (gmx_mm256_any_lt(rsq20,rcutoff2))
2265 /* REACTION-FIELD ELECTROSTATICS */
2266 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
2268 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
2272 fscal = _mm256_and_ps(fscal,cutoff_mask);
2274 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2276 /* Calculate temporary vectorial force */
2277 tx = _mm256_mul_ps(fscal,dx20);
2278 ty = _mm256_mul_ps(fscal,dy20);
2279 tz = _mm256_mul_ps(fscal,dz20);
2281 /* Update vectorial force */
2282 fix2 = _mm256_add_ps(fix2,tx);
2283 fiy2 = _mm256_add_ps(fiy2,ty);
2284 fiz2 = _mm256_add_ps(fiz2,tz);
2286 fjx0 = _mm256_add_ps(fjx0,tx);
2287 fjy0 = _mm256_add_ps(fjy0,ty);
2288 fjz0 = _mm256_add_ps(fjz0,tz);
2292 /**************************
2293 * CALCULATE INTERACTIONS *
2294 **************************/
2296 if (gmx_mm256_any_lt(rsq21,rcutoff2))
2299 /* REACTION-FIELD ELECTROSTATICS */
2300 felec = _mm256_mul_ps(qq21,_mm256_sub_ps(_mm256_mul_ps(rinv21,rinvsq21),krf2));
2302 cutoff_mask = _mm256_cmp_ps(rsq21,rcutoff2,_CMP_LT_OQ);
2306 fscal = _mm256_and_ps(fscal,cutoff_mask);
2308 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2310 /* Calculate temporary vectorial force */
2311 tx = _mm256_mul_ps(fscal,dx21);
2312 ty = _mm256_mul_ps(fscal,dy21);
2313 tz = _mm256_mul_ps(fscal,dz21);
2315 /* Update vectorial force */
2316 fix2 = _mm256_add_ps(fix2,tx);
2317 fiy2 = _mm256_add_ps(fiy2,ty);
2318 fiz2 = _mm256_add_ps(fiz2,tz);
2320 fjx1 = _mm256_add_ps(fjx1,tx);
2321 fjy1 = _mm256_add_ps(fjy1,ty);
2322 fjz1 = _mm256_add_ps(fjz1,tz);
2326 /**************************
2327 * CALCULATE INTERACTIONS *
2328 **************************/
2330 if (gmx_mm256_any_lt(rsq22,rcutoff2))
2333 /* REACTION-FIELD ELECTROSTATICS */
2334 felec = _mm256_mul_ps(qq22,_mm256_sub_ps(_mm256_mul_ps(rinv22,rinvsq22),krf2));
2336 cutoff_mask = _mm256_cmp_ps(rsq22,rcutoff2,_CMP_LT_OQ);
2340 fscal = _mm256_and_ps(fscal,cutoff_mask);
2342 fscal = _mm256_andnot_ps(dummy_mask,fscal);
2344 /* Calculate temporary vectorial force */
2345 tx = _mm256_mul_ps(fscal,dx22);
2346 ty = _mm256_mul_ps(fscal,dy22);
2347 tz = _mm256_mul_ps(fscal,dz22);
2349 /* Update vectorial force */
2350 fix2 = _mm256_add_ps(fix2,tx);
2351 fiy2 = _mm256_add_ps(fiy2,ty);
2352 fiz2 = _mm256_add_ps(fiz2,tz);
2354 fjx2 = _mm256_add_ps(fjx2,tx);
2355 fjy2 = _mm256_add_ps(fjy2,ty);
2356 fjz2 = _mm256_add_ps(fjz2,tz);
2360 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
2361 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
2362 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
2363 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
2364 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
2365 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
2366 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
2367 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
2369 gmx_mm256_decrement_3rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,
2370 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
2372 /* Inner loop uses 298 flops */
2375 /* End of innermost loop */
2377 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
2378 f+i_coord_offset,fshift+i_shift_offset);
2380 /* Increment number of inner iterations */
2381 inneriter += j_index_end - j_index_start;
2383 /* Outer loop uses 18 flops */
2386 /* Increment number of outer iterations */
2389 /* Update outer/inner flops */
2391 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*298);