2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
79 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
80 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
81 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
84 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
87 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
88 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
90 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
91 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
93 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
94 real rswitch_scalar,d_scalar;
95 __m256d dummy_mask,cutoff_mask;
96 __m128 tmpmask0,tmpmask1;
97 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
98 __m256d one = _mm256_set1_pd(1.0);
99 __m256d two = _mm256_set1_pd(2.0);
105 jindex = nlist->jindex;
107 shiftidx = nlist->shift;
109 shiftvec = fr->shift_vec[0];
110 fshift = fr->fshift[0];
111 facel = _mm256_set1_pd(fr->epsfac);
112 charge = mdatoms->chargeA;
113 nvdwtype = fr->ntype;
115 vdwtype = mdatoms->typeA;
117 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
118 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
119 beta2 = _mm256_mul_pd(beta,beta);
120 beta3 = _mm256_mul_pd(beta,beta2);
122 ewtab = fr->ic->tabq_coul_FDV0;
123 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
124 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
126 /* Setup water-specific parameters */
127 inr = nlist->iinr[0];
128 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
129 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
130 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
131 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
133 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
134 rcutoff_scalar = fr->rcoulomb;
135 rcutoff = _mm256_set1_pd(rcutoff_scalar);
136 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
138 rswitch_scalar = fr->rcoulomb_switch;
139 rswitch = _mm256_set1_pd(rswitch_scalar);
140 /* Setup switch parameters */
141 d_scalar = rcutoff_scalar-rswitch_scalar;
142 d = _mm256_set1_pd(d_scalar);
143 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
144 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
145 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
146 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
147 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
148 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
150 /* Avoid stupid compiler warnings */
151 jnrA = jnrB = jnrC = jnrD = 0;
160 for(iidx=0;iidx<4*DIM;iidx++)
165 /* Start outer loop over neighborlists */
166 for(iidx=0; iidx<nri; iidx++)
168 /* Load shift vector for this list */
169 i_shift_offset = DIM*shiftidx[iidx];
171 /* Load limits for loop over neighbors */
172 j_index_start = jindex[iidx];
173 j_index_end = jindex[iidx+1];
175 /* Get outer coordinate index */
177 i_coord_offset = DIM*inr;
179 /* Load i particle coords and add shift vector */
180 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
181 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
183 fix0 = _mm256_setzero_pd();
184 fiy0 = _mm256_setzero_pd();
185 fiz0 = _mm256_setzero_pd();
186 fix1 = _mm256_setzero_pd();
187 fiy1 = _mm256_setzero_pd();
188 fiz1 = _mm256_setzero_pd();
189 fix2 = _mm256_setzero_pd();
190 fiy2 = _mm256_setzero_pd();
191 fiz2 = _mm256_setzero_pd();
193 /* Reset potential sums */
194 velecsum = _mm256_setzero_pd();
195 vvdwsum = _mm256_setzero_pd();
197 /* Start inner kernel loop */
198 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
201 /* Get j neighbor index, and coordinate index */
206 j_coord_offsetA = DIM*jnrA;
207 j_coord_offsetB = DIM*jnrB;
208 j_coord_offsetC = DIM*jnrC;
209 j_coord_offsetD = DIM*jnrD;
211 /* load j atom coordinates */
212 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
213 x+j_coord_offsetC,x+j_coord_offsetD,
216 /* Calculate displacement vector */
217 dx00 = _mm256_sub_pd(ix0,jx0);
218 dy00 = _mm256_sub_pd(iy0,jy0);
219 dz00 = _mm256_sub_pd(iz0,jz0);
220 dx10 = _mm256_sub_pd(ix1,jx0);
221 dy10 = _mm256_sub_pd(iy1,jy0);
222 dz10 = _mm256_sub_pd(iz1,jz0);
223 dx20 = _mm256_sub_pd(ix2,jx0);
224 dy20 = _mm256_sub_pd(iy2,jy0);
225 dz20 = _mm256_sub_pd(iz2,jz0);
227 /* Calculate squared distance and things based on it */
228 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
229 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
230 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
232 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
233 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
234 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
236 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
237 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
238 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
240 /* Load parameters for j particles */
241 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
242 charge+jnrC+0,charge+jnrD+0);
243 vdwjidx0A = 2*vdwtype[jnrA+0];
244 vdwjidx0B = 2*vdwtype[jnrB+0];
245 vdwjidx0C = 2*vdwtype[jnrC+0];
246 vdwjidx0D = 2*vdwtype[jnrD+0];
248 fjx0 = _mm256_setzero_pd();
249 fjy0 = _mm256_setzero_pd();
250 fjz0 = _mm256_setzero_pd();
252 /**************************
253 * CALCULATE INTERACTIONS *
254 **************************/
256 if (gmx_mm256_any_lt(rsq00,rcutoff2))
259 r00 = _mm256_mul_pd(rsq00,rinv00);
261 /* Compute parameters for interactions between i and j atoms */
262 qq00 = _mm256_mul_pd(iq0,jq0);
263 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
264 vdwioffsetptr0+vdwjidx0B,
265 vdwioffsetptr0+vdwjidx0C,
266 vdwioffsetptr0+vdwjidx0D,
269 /* EWALD ELECTROSTATICS */
271 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
272 ewrt = _mm256_mul_pd(r00,ewtabscale);
273 ewitab = _mm256_cvttpd_epi32(ewrt);
274 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
275 ewitab = _mm_slli_epi32(ewitab,2);
276 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
277 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
278 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
279 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
280 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
281 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
282 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
283 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
284 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
286 /* LENNARD-JONES DISPERSION/REPULSION */
288 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
289 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
290 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
291 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
292 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
294 d = _mm256_sub_pd(r00,rswitch);
295 d = _mm256_max_pd(d,_mm256_setzero_pd());
296 d2 = _mm256_mul_pd(d,d);
297 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
299 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
301 /* Evaluate switch function */
302 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
303 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
304 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
305 velec = _mm256_mul_pd(velec,sw);
306 vvdw = _mm256_mul_pd(vvdw,sw);
307 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
309 /* Update potential sum for this i atom from the interaction with this j atom. */
310 velec = _mm256_and_pd(velec,cutoff_mask);
311 velecsum = _mm256_add_pd(velecsum,velec);
312 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
313 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
315 fscal = _mm256_add_pd(felec,fvdw);
317 fscal = _mm256_and_pd(fscal,cutoff_mask);
319 /* Calculate temporary vectorial force */
320 tx = _mm256_mul_pd(fscal,dx00);
321 ty = _mm256_mul_pd(fscal,dy00);
322 tz = _mm256_mul_pd(fscal,dz00);
324 /* Update vectorial force */
325 fix0 = _mm256_add_pd(fix0,tx);
326 fiy0 = _mm256_add_pd(fiy0,ty);
327 fiz0 = _mm256_add_pd(fiz0,tz);
329 fjx0 = _mm256_add_pd(fjx0,tx);
330 fjy0 = _mm256_add_pd(fjy0,ty);
331 fjz0 = _mm256_add_pd(fjz0,tz);
335 /**************************
336 * CALCULATE INTERACTIONS *
337 **************************/
339 if (gmx_mm256_any_lt(rsq10,rcutoff2))
342 r10 = _mm256_mul_pd(rsq10,rinv10);
344 /* Compute parameters for interactions between i and j atoms */
345 qq10 = _mm256_mul_pd(iq1,jq0);
347 /* EWALD ELECTROSTATICS */
349 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
350 ewrt = _mm256_mul_pd(r10,ewtabscale);
351 ewitab = _mm256_cvttpd_epi32(ewrt);
352 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
353 ewitab = _mm_slli_epi32(ewitab,2);
354 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
355 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
356 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
357 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
358 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
359 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
360 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
361 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
362 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
364 d = _mm256_sub_pd(r10,rswitch);
365 d = _mm256_max_pd(d,_mm256_setzero_pd());
366 d2 = _mm256_mul_pd(d,d);
367 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
369 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
371 /* Evaluate switch function */
372 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
373 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
374 velec = _mm256_mul_pd(velec,sw);
375 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
377 /* Update potential sum for this i atom from the interaction with this j atom. */
378 velec = _mm256_and_pd(velec,cutoff_mask);
379 velecsum = _mm256_add_pd(velecsum,velec);
383 fscal = _mm256_and_pd(fscal,cutoff_mask);
385 /* Calculate temporary vectorial force */
386 tx = _mm256_mul_pd(fscal,dx10);
387 ty = _mm256_mul_pd(fscal,dy10);
388 tz = _mm256_mul_pd(fscal,dz10);
390 /* Update vectorial force */
391 fix1 = _mm256_add_pd(fix1,tx);
392 fiy1 = _mm256_add_pd(fiy1,ty);
393 fiz1 = _mm256_add_pd(fiz1,tz);
395 fjx0 = _mm256_add_pd(fjx0,tx);
396 fjy0 = _mm256_add_pd(fjy0,ty);
397 fjz0 = _mm256_add_pd(fjz0,tz);
401 /**************************
402 * CALCULATE INTERACTIONS *
403 **************************/
405 if (gmx_mm256_any_lt(rsq20,rcutoff2))
408 r20 = _mm256_mul_pd(rsq20,rinv20);
410 /* Compute parameters for interactions between i and j atoms */
411 qq20 = _mm256_mul_pd(iq2,jq0);
413 /* EWALD ELECTROSTATICS */
415 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
416 ewrt = _mm256_mul_pd(r20,ewtabscale);
417 ewitab = _mm256_cvttpd_epi32(ewrt);
418 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
419 ewitab = _mm_slli_epi32(ewitab,2);
420 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
421 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
422 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
423 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
424 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
425 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
426 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
427 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
428 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
430 d = _mm256_sub_pd(r20,rswitch);
431 d = _mm256_max_pd(d,_mm256_setzero_pd());
432 d2 = _mm256_mul_pd(d,d);
433 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
435 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
437 /* Evaluate switch function */
438 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
439 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
440 velec = _mm256_mul_pd(velec,sw);
441 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
443 /* Update potential sum for this i atom from the interaction with this j atom. */
444 velec = _mm256_and_pd(velec,cutoff_mask);
445 velecsum = _mm256_add_pd(velecsum,velec);
449 fscal = _mm256_and_pd(fscal,cutoff_mask);
451 /* Calculate temporary vectorial force */
452 tx = _mm256_mul_pd(fscal,dx20);
453 ty = _mm256_mul_pd(fscal,dy20);
454 tz = _mm256_mul_pd(fscal,dz20);
456 /* Update vectorial force */
457 fix2 = _mm256_add_pd(fix2,tx);
458 fiy2 = _mm256_add_pd(fiy2,ty);
459 fiz2 = _mm256_add_pd(fiz2,tz);
461 fjx0 = _mm256_add_pd(fjx0,tx);
462 fjy0 = _mm256_add_pd(fjy0,ty);
463 fjz0 = _mm256_add_pd(fjz0,tz);
467 fjptrA = f+j_coord_offsetA;
468 fjptrB = f+j_coord_offsetB;
469 fjptrC = f+j_coord_offsetC;
470 fjptrD = f+j_coord_offsetD;
472 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
474 /* Inner loop uses 216 flops */
480 /* Get j neighbor index, and coordinate index */
481 jnrlistA = jjnr[jidx];
482 jnrlistB = jjnr[jidx+1];
483 jnrlistC = jjnr[jidx+2];
484 jnrlistD = jjnr[jidx+3];
485 /* Sign of each element will be negative for non-real atoms.
486 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
487 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
489 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
491 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
492 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
493 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
495 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
496 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
497 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
498 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
499 j_coord_offsetA = DIM*jnrA;
500 j_coord_offsetB = DIM*jnrB;
501 j_coord_offsetC = DIM*jnrC;
502 j_coord_offsetD = DIM*jnrD;
504 /* load j atom coordinates */
505 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
506 x+j_coord_offsetC,x+j_coord_offsetD,
509 /* Calculate displacement vector */
510 dx00 = _mm256_sub_pd(ix0,jx0);
511 dy00 = _mm256_sub_pd(iy0,jy0);
512 dz00 = _mm256_sub_pd(iz0,jz0);
513 dx10 = _mm256_sub_pd(ix1,jx0);
514 dy10 = _mm256_sub_pd(iy1,jy0);
515 dz10 = _mm256_sub_pd(iz1,jz0);
516 dx20 = _mm256_sub_pd(ix2,jx0);
517 dy20 = _mm256_sub_pd(iy2,jy0);
518 dz20 = _mm256_sub_pd(iz2,jz0);
520 /* Calculate squared distance and things based on it */
521 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
522 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
523 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
525 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
526 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
527 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
529 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
530 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
531 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
533 /* Load parameters for j particles */
534 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
535 charge+jnrC+0,charge+jnrD+0);
536 vdwjidx0A = 2*vdwtype[jnrA+0];
537 vdwjidx0B = 2*vdwtype[jnrB+0];
538 vdwjidx0C = 2*vdwtype[jnrC+0];
539 vdwjidx0D = 2*vdwtype[jnrD+0];
541 fjx0 = _mm256_setzero_pd();
542 fjy0 = _mm256_setzero_pd();
543 fjz0 = _mm256_setzero_pd();
545 /**************************
546 * CALCULATE INTERACTIONS *
547 **************************/
549 if (gmx_mm256_any_lt(rsq00,rcutoff2))
552 r00 = _mm256_mul_pd(rsq00,rinv00);
553 r00 = _mm256_andnot_pd(dummy_mask,r00);
555 /* Compute parameters for interactions between i and j atoms */
556 qq00 = _mm256_mul_pd(iq0,jq0);
557 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
558 vdwioffsetptr0+vdwjidx0B,
559 vdwioffsetptr0+vdwjidx0C,
560 vdwioffsetptr0+vdwjidx0D,
563 /* EWALD ELECTROSTATICS */
565 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
566 ewrt = _mm256_mul_pd(r00,ewtabscale);
567 ewitab = _mm256_cvttpd_epi32(ewrt);
568 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
569 ewitab = _mm_slli_epi32(ewitab,2);
570 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
571 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
572 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
573 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
574 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
575 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
576 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
577 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
578 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
580 /* LENNARD-JONES DISPERSION/REPULSION */
582 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
583 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
584 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
585 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
586 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
588 d = _mm256_sub_pd(r00,rswitch);
589 d = _mm256_max_pd(d,_mm256_setzero_pd());
590 d2 = _mm256_mul_pd(d,d);
591 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
593 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
595 /* Evaluate switch function */
596 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
597 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
598 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
599 velec = _mm256_mul_pd(velec,sw);
600 vvdw = _mm256_mul_pd(vvdw,sw);
601 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
603 /* Update potential sum for this i atom from the interaction with this j atom. */
604 velec = _mm256_and_pd(velec,cutoff_mask);
605 velec = _mm256_andnot_pd(dummy_mask,velec);
606 velecsum = _mm256_add_pd(velecsum,velec);
607 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
608 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
609 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
611 fscal = _mm256_add_pd(felec,fvdw);
613 fscal = _mm256_and_pd(fscal,cutoff_mask);
615 fscal = _mm256_andnot_pd(dummy_mask,fscal);
617 /* Calculate temporary vectorial force */
618 tx = _mm256_mul_pd(fscal,dx00);
619 ty = _mm256_mul_pd(fscal,dy00);
620 tz = _mm256_mul_pd(fscal,dz00);
622 /* Update vectorial force */
623 fix0 = _mm256_add_pd(fix0,tx);
624 fiy0 = _mm256_add_pd(fiy0,ty);
625 fiz0 = _mm256_add_pd(fiz0,tz);
627 fjx0 = _mm256_add_pd(fjx0,tx);
628 fjy0 = _mm256_add_pd(fjy0,ty);
629 fjz0 = _mm256_add_pd(fjz0,tz);
633 /**************************
634 * CALCULATE INTERACTIONS *
635 **************************/
637 if (gmx_mm256_any_lt(rsq10,rcutoff2))
640 r10 = _mm256_mul_pd(rsq10,rinv10);
641 r10 = _mm256_andnot_pd(dummy_mask,r10);
643 /* Compute parameters for interactions between i and j atoms */
644 qq10 = _mm256_mul_pd(iq1,jq0);
646 /* EWALD ELECTROSTATICS */
648 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
649 ewrt = _mm256_mul_pd(r10,ewtabscale);
650 ewitab = _mm256_cvttpd_epi32(ewrt);
651 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
652 ewitab = _mm_slli_epi32(ewitab,2);
653 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
654 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
655 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
656 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
657 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
658 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
659 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
660 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
661 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
663 d = _mm256_sub_pd(r10,rswitch);
664 d = _mm256_max_pd(d,_mm256_setzero_pd());
665 d2 = _mm256_mul_pd(d,d);
666 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
668 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
670 /* Evaluate switch function */
671 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
672 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
673 velec = _mm256_mul_pd(velec,sw);
674 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
676 /* Update potential sum for this i atom from the interaction with this j atom. */
677 velec = _mm256_and_pd(velec,cutoff_mask);
678 velec = _mm256_andnot_pd(dummy_mask,velec);
679 velecsum = _mm256_add_pd(velecsum,velec);
683 fscal = _mm256_and_pd(fscal,cutoff_mask);
685 fscal = _mm256_andnot_pd(dummy_mask,fscal);
687 /* Calculate temporary vectorial force */
688 tx = _mm256_mul_pd(fscal,dx10);
689 ty = _mm256_mul_pd(fscal,dy10);
690 tz = _mm256_mul_pd(fscal,dz10);
692 /* Update vectorial force */
693 fix1 = _mm256_add_pd(fix1,tx);
694 fiy1 = _mm256_add_pd(fiy1,ty);
695 fiz1 = _mm256_add_pd(fiz1,tz);
697 fjx0 = _mm256_add_pd(fjx0,tx);
698 fjy0 = _mm256_add_pd(fjy0,ty);
699 fjz0 = _mm256_add_pd(fjz0,tz);
703 /**************************
704 * CALCULATE INTERACTIONS *
705 **************************/
707 if (gmx_mm256_any_lt(rsq20,rcutoff2))
710 r20 = _mm256_mul_pd(rsq20,rinv20);
711 r20 = _mm256_andnot_pd(dummy_mask,r20);
713 /* Compute parameters for interactions between i and j atoms */
714 qq20 = _mm256_mul_pd(iq2,jq0);
716 /* EWALD ELECTROSTATICS */
718 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
719 ewrt = _mm256_mul_pd(r20,ewtabscale);
720 ewitab = _mm256_cvttpd_epi32(ewrt);
721 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
722 ewitab = _mm_slli_epi32(ewitab,2);
723 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
724 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
725 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
726 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
727 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
728 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
729 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
730 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
731 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
733 d = _mm256_sub_pd(r20,rswitch);
734 d = _mm256_max_pd(d,_mm256_setzero_pd());
735 d2 = _mm256_mul_pd(d,d);
736 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
738 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
740 /* Evaluate switch function */
741 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
742 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
743 velec = _mm256_mul_pd(velec,sw);
744 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
746 /* Update potential sum for this i atom from the interaction with this j atom. */
747 velec = _mm256_and_pd(velec,cutoff_mask);
748 velec = _mm256_andnot_pd(dummy_mask,velec);
749 velecsum = _mm256_add_pd(velecsum,velec);
753 fscal = _mm256_and_pd(fscal,cutoff_mask);
755 fscal = _mm256_andnot_pd(dummy_mask,fscal);
757 /* Calculate temporary vectorial force */
758 tx = _mm256_mul_pd(fscal,dx20);
759 ty = _mm256_mul_pd(fscal,dy20);
760 tz = _mm256_mul_pd(fscal,dz20);
762 /* Update vectorial force */
763 fix2 = _mm256_add_pd(fix2,tx);
764 fiy2 = _mm256_add_pd(fiy2,ty);
765 fiz2 = _mm256_add_pd(fiz2,tz);
767 fjx0 = _mm256_add_pd(fjx0,tx);
768 fjy0 = _mm256_add_pd(fjy0,ty);
769 fjz0 = _mm256_add_pd(fjz0,tz);
773 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
774 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
775 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
776 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
778 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
780 /* Inner loop uses 219 flops */
783 /* End of innermost loop */
785 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
786 f+i_coord_offset,fshift+i_shift_offset);
789 /* Update potential energies */
790 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
791 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
793 /* Increment number of inner iterations */
794 inneriter += j_index_end - j_index_start;
796 /* Outer loop uses 20 flops */
799 /* Increment number of outer iterations */
802 /* Update outer/inner flops */
804 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*219);
807 * Gromacs nonbonded kernel: nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_256_double
808 * Electrostatics interaction: Ewald
809 * VdW interaction: LennardJones
810 * Geometry: Water3-Particle
811 * Calculate force/pot: Force
814 nb_kernel_ElecEwSw_VdwLJSw_GeomW3P1_F_avx_256_double
815 (t_nblist * gmx_restrict nlist,
816 rvec * gmx_restrict xx,
817 rvec * gmx_restrict ff,
818 t_forcerec * gmx_restrict fr,
819 t_mdatoms * gmx_restrict mdatoms,
820 nb_kernel_data_t * gmx_restrict kernel_data,
821 t_nrnb * gmx_restrict nrnb)
823 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
824 * just 0 for non-waters.
825 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
826 * jnr indices corresponding to data put in the four positions in the SIMD register.
828 int i_shift_offset,i_coord_offset,outeriter,inneriter;
829 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
830 int jnrA,jnrB,jnrC,jnrD;
831 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
832 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
833 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
834 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
836 real *shiftvec,*fshift,*x,*f;
837 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
839 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
840 real * vdwioffsetptr0;
841 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
842 real * vdwioffsetptr1;
843 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
844 real * vdwioffsetptr2;
845 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
846 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
847 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
848 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
849 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
850 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
851 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
854 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
857 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
858 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
860 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
861 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
863 __m256d rswitch,swV3,swV4,swV5,swF2,swF3,swF4,d,d2,sw,dsw;
864 real rswitch_scalar,d_scalar;
865 __m256d dummy_mask,cutoff_mask;
866 __m128 tmpmask0,tmpmask1;
867 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
868 __m256d one = _mm256_set1_pd(1.0);
869 __m256d two = _mm256_set1_pd(2.0);
875 jindex = nlist->jindex;
877 shiftidx = nlist->shift;
879 shiftvec = fr->shift_vec[0];
880 fshift = fr->fshift[0];
881 facel = _mm256_set1_pd(fr->epsfac);
882 charge = mdatoms->chargeA;
883 nvdwtype = fr->ntype;
885 vdwtype = mdatoms->typeA;
887 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
888 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
889 beta2 = _mm256_mul_pd(beta,beta);
890 beta3 = _mm256_mul_pd(beta,beta2);
892 ewtab = fr->ic->tabq_coul_FDV0;
893 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
894 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
896 /* Setup water-specific parameters */
897 inr = nlist->iinr[0];
898 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
899 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
900 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
901 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
903 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
904 rcutoff_scalar = fr->rcoulomb;
905 rcutoff = _mm256_set1_pd(rcutoff_scalar);
906 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
908 rswitch_scalar = fr->rcoulomb_switch;
909 rswitch = _mm256_set1_pd(rswitch_scalar);
910 /* Setup switch parameters */
911 d_scalar = rcutoff_scalar-rswitch_scalar;
912 d = _mm256_set1_pd(d_scalar);
913 swV3 = _mm256_set1_pd(-10.0/(d_scalar*d_scalar*d_scalar));
914 swV4 = _mm256_set1_pd( 15.0/(d_scalar*d_scalar*d_scalar*d_scalar));
915 swV5 = _mm256_set1_pd( -6.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
916 swF2 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar));
917 swF3 = _mm256_set1_pd( 60.0/(d_scalar*d_scalar*d_scalar*d_scalar));
918 swF4 = _mm256_set1_pd(-30.0/(d_scalar*d_scalar*d_scalar*d_scalar*d_scalar));
920 /* Avoid stupid compiler warnings */
921 jnrA = jnrB = jnrC = jnrD = 0;
930 for(iidx=0;iidx<4*DIM;iidx++)
935 /* Start outer loop over neighborlists */
936 for(iidx=0; iidx<nri; iidx++)
938 /* Load shift vector for this list */
939 i_shift_offset = DIM*shiftidx[iidx];
941 /* Load limits for loop over neighbors */
942 j_index_start = jindex[iidx];
943 j_index_end = jindex[iidx+1];
945 /* Get outer coordinate index */
947 i_coord_offset = DIM*inr;
949 /* Load i particle coords and add shift vector */
950 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
951 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
953 fix0 = _mm256_setzero_pd();
954 fiy0 = _mm256_setzero_pd();
955 fiz0 = _mm256_setzero_pd();
956 fix1 = _mm256_setzero_pd();
957 fiy1 = _mm256_setzero_pd();
958 fiz1 = _mm256_setzero_pd();
959 fix2 = _mm256_setzero_pd();
960 fiy2 = _mm256_setzero_pd();
961 fiz2 = _mm256_setzero_pd();
963 /* Start inner kernel loop */
964 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
967 /* Get j neighbor index, and coordinate index */
972 j_coord_offsetA = DIM*jnrA;
973 j_coord_offsetB = DIM*jnrB;
974 j_coord_offsetC = DIM*jnrC;
975 j_coord_offsetD = DIM*jnrD;
977 /* load j atom coordinates */
978 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
979 x+j_coord_offsetC,x+j_coord_offsetD,
982 /* Calculate displacement vector */
983 dx00 = _mm256_sub_pd(ix0,jx0);
984 dy00 = _mm256_sub_pd(iy0,jy0);
985 dz00 = _mm256_sub_pd(iz0,jz0);
986 dx10 = _mm256_sub_pd(ix1,jx0);
987 dy10 = _mm256_sub_pd(iy1,jy0);
988 dz10 = _mm256_sub_pd(iz1,jz0);
989 dx20 = _mm256_sub_pd(ix2,jx0);
990 dy20 = _mm256_sub_pd(iy2,jy0);
991 dz20 = _mm256_sub_pd(iz2,jz0);
993 /* Calculate squared distance and things based on it */
994 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
995 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
996 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
998 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
999 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1000 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1002 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1003 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1004 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1006 /* Load parameters for j particles */
1007 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1008 charge+jnrC+0,charge+jnrD+0);
1009 vdwjidx0A = 2*vdwtype[jnrA+0];
1010 vdwjidx0B = 2*vdwtype[jnrB+0];
1011 vdwjidx0C = 2*vdwtype[jnrC+0];
1012 vdwjidx0D = 2*vdwtype[jnrD+0];
1014 fjx0 = _mm256_setzero_pd();
1015 fjy0 = _mm256_setzero_pd();
1016 fjz0 = _mm256_setzero_pd();
1018 /**************************
1019 * CALCULATE INTERACTIONS *
1020 **************************/
1022 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1025 r00 = _mm256_mul_pd(rsq00,rinv00);
1027 /* Compute parameters for interactions between i and j atoms */
1028 qq00 = _mm256_mul_pd(iq0,jq0);
1029 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1030 vdwioffsetptr0+vdwjidx0B,
1031 vdwioffsetptr0+vdwjidx0C,
1032 vdwioffsetptr0+vdwjidx0D,
1035 /* EWALD ELECTROSTATICS */
1037 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1038 ewrt = _mm256_mul_pd(r00,ewtabscale);
1039 ewitab = _mm256_cvttpd_epi32(ewrt);
1040 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1041 ewitab = _mm_slli_epi32(ewitab,2);
1042 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1043 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1044 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1045 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1046 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1047 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1048 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1049 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
1050 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1052 /* LENNARD-JONES DISPERSION/REPULSION */
1054 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1055 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1056 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1057 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1058 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1060 d = _mm256_sub_pd(r00,rswitch);
1061 d = _mm256_max_pd(d,_mm256_setzero_pd());
1062 d2 = _mm256_mul_pd(d,d);
1063 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1065 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1067 /* Evaluate switch function */
1068 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1069 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
1070 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1071 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1073 fscal = _mm256_add_pd(felec,fvdw);
1075 fscal = _mm256_and_pd(fscal,cutoff_mask);
1077 /* Calculate temporary vectorial force */
1078 tx = _mm256_mul_pd(fscal,dx00);
1079 ty = _mm256_mul_pd(fscal,dy00);
1080 tz = _mm256_mul_pd(fscal,dz00);
1082 /* Update vectorial force */
1083 fix0 = _mm256_add_pd(fix0,tx);
1084 fiy0 = _mm256_add_pd(fiy0,ty);
1085 fiz0 = _mm256_add_pd(fiz0,tz);
1087 fjx0 = _mm256_add_pd(fjx0,tx);
1088 fjy0 = _mm256_add_pd(fjy0,ty);
1089 fjz0 = _mm256_add_pd(fjz0,tz);
1093 /**************************
1094 * CALCULATE INTERACTIONS *
1095 **************************/
1097 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1100 r10 = _mm256_mul_pd(rsq10,rinv10);
1102 /* Compute parameters for interactions between i and j atoms */
1103 qq10 = _mm256_mul_pd(iq1,jq0);
1105 /* EWALD ELECTROSTATICS */
1107 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1108 ewrt = _mm256_mul_pd(r10,ewtabscale);
1109 ewitab = _mm256_cvttpd_epi32(ewrt);
1110 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1111 ewitab = _mm_slli_epi32(ewitab,2);
1112 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1113 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1114 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1115 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1116 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1117 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1118 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1119 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1120 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1122 d = _mm256_sub_pd(r10,rswitch);
1123 d = _mm256_max_pd(d,_mm256_setzero_pd());
1124 d2 = _mm256_mul_pd(d,d);
1125 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1127 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1129 /* Evaluate switch function */
1130 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1131 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
1132 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1136 fscal = _mm256_and_pd(fscal,cutoff_mask);
1138 /* Calculate temporary vectorial force */
1139 tx = _mm256_mul_pd(fscal,dx10);
1140 ty = _mm256_mul_pd(fscal,dy10);
1141 tz = _mm256_mul_pd(fscal,dz10);
1143 /* Update vectorial force */
1144 fix1 = _mm256_add_pd(fix1,tx);
1145 fiy1 = _mm256_add_pd(fiy1,ty);
1146 fiz1 = _mm256_add_pd(fiz1,tz);
1148 fjx0 = _mm256_add_pd(fjx0,tx);
1149 fjy0 = _mm256_add_pd(fjy0,ty);
1150 fjz0 = _mm256_add_pd(fjz0,tz);
1154 /**************************
1155 * CALCULATE INTERACTIONS *
1156 **************************/
1158 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1161 r20 = _mm256_mul_pd(rsq20,rinv20);
1163 /* Compute parameters for interactions between i and j atoms */
1164 qq20 = _mm256_mul_pd(iq2,jq0);
1166 /* EWALD ELECTROSTATICS */
1168 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1169 ewrt = _mm256_mul_pd(r20,ewtabscale);
1170 ewitab = _mm256_cvttpd_epi32(ewrt);
1171 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1172 ewitab = _mm_slli_epi32(ewitab,2);
1173 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1174 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1175 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1176 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1177 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1178 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1179 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1180 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1181 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1183 d = _mm256_sub_pd(r20,rswitch);
1184 d = _mm256_max_pd(d,_mm256_setzero_pd());
1185 d2 = _mm256_mul_pd(d,d);
1186 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1188 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1190 /* Evaluate switch function */
1191 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1192 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
1193 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1197 fscal = _mm256_and_pd(fscal,cutoff_mask);
1199 /* Calculate temporary vectorial force */
1200 tx = _mm256_mul_pd(fscal,dx20);
1201 ty = _mm256_mul_pd(fscal,dy20);
1202 tz = _mm256_mul_pd(fscal,dz20);
1204 /* Update vectorial force */
1205 fix2 = _mm256_add_pd(fix2,tx);
1206 fiy2 = _mm256_add_pd(fiy2,ty);
1207 fiz2 = _mm256_add_pd(fiz2,tz);
1209 fjx0 = _mm256_add_pd(fjx0,tx);
1210 fjy0 = _mm256_add_pd(fjy0,ty);
1211 fjz0 = _mm256_add_pd(fjz0,tz);
1215 fjptrA = f+j_coord_offsetA;
1216 fjptrB = f+j_coord_offsetB;
1217 fjptrC = f+j_coord_offsetC;
1218 fjptrD = f+j_coord_offsetD;
1220 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1222 /* Inner loop uses 204 flops */
1225 if(jidx<j_index_end)
1228 /* Get j neighbor index, and coordinate index */
1229 jnrlistA = jjnr[jidx];
1230 jnrlistB = jjnr[jidx+1];
1231 jnrlistC = jjnr[jidx+2];
1232 jnrlistD = jjnr[jidx+3];
1233 /* Sign of each element will be negative for non-real atoms.
1234 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1235 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1237 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1239 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1240 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1241 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1243 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1244 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1245 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1246 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1247 j_coord_offsetA = DIM*jnrA;
1248 j_coord_offsetB = DIM*jnrB;
1249 j_coord_offsetC = DIM*jnrC;
1250 j_coord_offsetD = DIM*jnrD;
1252 /* load j atom coordinates */
1253 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1254 x+j_coord_offsetC,x+j_coord_offsetD,
1257 /* Calculate displacement vector */
1258 dx00 = _mm256_sub_pd(ix0,jx0);
1259 dy00 = _mm256_sub_pd(iy0,jy0);
1260 dz00 = _mm256_sub_pd(iz0,jz0);
1261 dx10 = _mm256_sub_pd(ix1,jx0);
1262 dy10 = _mm256_sub_pd(iy1,jy0);
1263 dz10 = _mm256_sub_pd(iz1,jz0);
1264 dx20 = _mm256_sub_pd(ix2,jx0);
1265 dy20 = _mm256_sub_pd(iy2,jy0);
1266 dz20 = _mm256_sub_pd(iz2,jz0);
1268 /* Calculate squared distance and things based on it */
1269 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1270 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1271 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1273 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1274 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1275 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1277 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1278 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1279 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1281 /* Load parameters for j particles */
1282 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1283 charge+jnrC+0,charge+jnrD+0);
1284 vdwjidx0A = 2*vdwtype[jnrA+0];
1285 vdwjidx0B = 2*vdwtype[jnrB+0];
1286 vdwjidx0C = 2*vdwtype[jnrC+0];
1287 vdwjidx0D = 2*vdwtype[jnrD+0];
1289 fjx0 = _mm256_setzero_pd();
1290 fjy0 = _mm256_setzero_pd();
1291 fjz0 = _mm256_setzero_pd();
1293 /**************************
1294 * CALCULATE INTERACTIONS *
1295 **************************/
1297 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1300 r00 = _mm256_mul_pd(rsq00,rinv00);
1301 r00 = _mm256_andnot_pd(dummy_mask,r00);
1303 /* Compute parameters for interactions between i and j atoms */
1304 qq00 = _mm256_mul_pd(iq0,jq0);
1305 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1306 vdwioffsetptr0+vdwjidx0B,
1307 vdwioffsetptr0+vdwjidx0C,
1308 vdwioffsetptr0+vdwjidx0D,
1311 /* EWALD ELECTROSTATICS */
1313 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1314 ewrt = _mm256_mul_pd(r00,ewtabscale);
1315 ewitab = _mm256_cvttpd_epi32(ewrt);
1316 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1317 ewitab = _mm_slli_epi32(ewitab,2);
1318 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1319 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1320 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1321 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1322 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1323 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1324 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1325 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
1326 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1328 /* LENNARD-JONES DISPERSION/REPULSION */
1330 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1331 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
1332 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
1333 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
1334 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
1336 d = _mm256_sub_pd(r00,rswitch);
1337 d = _mm256_max_pd(d,_mm256_setzero_pd());
1338 d2 = _mm256_mul_pd(d,d);
1339 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1341 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1343 /* Evaluate switch function */
1344 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1345 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(velec,dsw)) );
1346 fvdw = _mm256_sub_pd( _mm256_mul_pd(fvdw,sw) , _mm256_mul_pd(rinv00,_mm256_mul_pd(vvdw,dsw)) );
1347 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1349 fscal = _mm256_add_pd(felec,fvdw);
1351 fscal = _mm256_and_pd(fscal,cutoff_mask);
1353 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1355 /* Calculate temporary vectorial force */
1356 tx = _mm256_mul_pd(fscal,dx00);
1357 ty = _mm256_mul_pd(fscal,dy00);
1358 tz = _mm256_mul_pd(fscal,dz00);
1360 /* Update vectorial force */
1361 fix0 = _mm256_add_pd(fix0,tx);
1362 fiy0 = _mm256_add_pd(fiy0,ty);
1363 fiz0 = _mm256_add_pd(fiz0,tz);
1365 fjx0 = _mm256_add_pd(fjx0,tx);
1366 fjy0 = _mm256_add_pd(fjy0,ty);
1367 fjz0 = _mm256_add_pd(fjz0,tz);
1371 /**************************
1372 * CALCULATE INTERACTIONS *
1373 **************************/
1375 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1378 r10 = _mm256_mul_pd(rsq10,rinv10);
1379 r10 = _mm256_andnot_pd(dummy_mask,r10);
1381 /* Compute parameters for interactions between i and j atoms */
1382 qq10 = _mm256_mul_pd(iq1,jq0);
1384 /* EWALD ELECTROSTATICS */
1386 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1387 ewrt = _mm256_mul_pd(r10,ewtabscale);
1388 ewitab = _mm256_cvttpd_epi32(ewrt);
1389 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1390 ewitab = _mm_slli_epi32(ewitab,2);
1391 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1392 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1393 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1394 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1395 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1396 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1397 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1398 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
1399 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1401 d = _mm256_sub_pd(r10,rswitch);
1402 d = _mm256_max_pd(d,_mm256_setzero_pd());
1403 d2 = _mm256_mul_pd(d,d);
1404 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1406 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1408 /* Evaluate switch function */
1409 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1410 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv10,_mm256_mul_pd(velec,dsw)) );
1411 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1415 fscal = _mm256_and_pd(fscal,cutoff_mask);
1417 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1419 /* Calculate temporary vectorial force */
1420 tx = _mm256_mul_pd(fscal,dx10);
1421 ty = _mm256_mul_pd(fscal,dy10);
1422 tz = _mm256_mul_pd(fscal,dz10);
1424 /* Update vectorial force */
1425 fix1 = _mm256_add_pd(fix1,tx);
1426 fiy1 = _mm256_add_pd(fiy1,ty);
1427 fiz1 = _mm256_add_pd(fiz1,tz);
1429 fjx0 = _mm256_add_pd(fjx0,tx);
1430 fjy0 = _mm256_add_pd(fjy0,ty);
1431 fjz0 = _mm256_add_pd(fjz0,tz);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1442 r20 = _mm256_mul_pd(rsq20,rinv20);
1443 r20 = _mm256_andnot_pd(dummy_mask,r20);
1445 /* Compute parameters for interactions between i and j atoms */
1446 qq20 = _mm256_mul_pd(iq2,jq0);
1448 /* EWALD ELECTROSTATICS */
1450 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1451 ewrt = _mm256_mul_pd(r20,ewtabscale);
1452 ewitab = _mm256_cvttpd_epi32(ewrt);
1453 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1454 ewitab = _mm_slli_epi32(ewitab,2);
1455 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
1456 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
1457 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
1458 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
1459 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
1460 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
1461 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
1462 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
1463 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1465 d = _mm256_sub_pd(r20,rswitch);
1466 d = _mm256_max_pd(d,_mm256_setzero_pd());
1467 d2 = _mm256_mul_pd(d,d);
1468 sw = _mm256_add_pd(one,_mm256_mul_pd(d2,_mm256_mul_pd(d,_mm256_add_pd(swV3,_mm256_mul_pd(d,_mm256_add_pd(swV4,_mm256_mul_pd(d,swV5)))))));
1470 dsw = _mm256_mul_pd(d2,_mm256_add_pd(swF2,_mm256_mul_pd(d,_mm256_add_pd(swF3,_mm256_mul_pd(d,swF4)))));
1472 /* Evaluate switch function */
1473 /* fscal'=f'/r=-(v*sw)'/r=-(v'*sw+v*dsw)/r=-v'*sw/r-v*dsw/r=fscal*sw-v*dsw/r */
1474 felec = _mm256_sub_pd( _mm256_mul_pd(felec,sw) , _mm256_mul_pd(rinv20,_mm256_mul_pd(velec,dsw)) );
1475 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1479 fscal = _mm256_and_pd(fscal,cutoff_mask);
1481 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1483 /* Calculate temporary vectorial force */
1484 tx = _mm256_mul_pd(fscal,dx20);
1485 ty = _mm256_mul_pd(fscal,dy20);
1486 tz = _mm256_mul_pd(fscal,dz20);
1488 /* Update vectorial force */
1489 fix2 = _mm256_add_pd(fix2,tx);
1490 fiy2 = _mm256_add_pd(fiy2,ty);
1491 fiz2 = _mm256_add_pd(fiz2,tz);
1493 fjx0 = _mm256_add_pd(fjx0,tx);
1494 fjy0 = _mm256_add_pd(fjy0,ty);
1495 fjz0 = _mm256_add_pd(fjz0,tz);
1499 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1500 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1501 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1502 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1504 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1506 /* Inner loop uses 207 flops */
1509 /* End of innermost loop */
1511 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1512 f+i_coord_offset,fshift+i_shift_offset);
1514 /* Increment number of inner iterations */
1515 inneriter += j_index_end - j_index_start;
1517 /* Outer loop uses 18 flops */
1520 /* Increment number of outer iterations */
1523 /* Update outer/inner flops */
1525 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*207);