2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEw_VdwLJ_GeomW4P1_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
81 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
83 __m256d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
84 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
87 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
90 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
91 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
93 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
94 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
96 __m256d dummy_mask,cutoff_mask;
97 __m128 tmpmask0,tmpmask1;
98 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
99 __m256d one = _mm256_set1_pd(1.0);
100 __m256d two = _mm256_set1_pd(2.0);
106 jindex = nlist->jindex;
108 shiftidx = nlist->shift;
110 shiftvec = fr->shift_vec[0];
111 fshift = fr->fshift[0];
112 facel = _mm256_set1_pd(fr->epsfac);
113 charge = mdatoms->chargeA;
114 nvdwtype = fr->ntype;
116 vdwtype = mdatoms->typeA;
118 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
119 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
120 beta2 = _mm256_mul_pd(beta,beta);
121 beta3 = _mm256_mul_pd(beta,beta2);
123 ewtab = fr->ic->tabq_coul_FDV0;
124 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
125 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
127 /* Setup water-specific parameters */
128 inr = nlist->iinr[0];
129 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
130 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
131 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
132 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
134 /* Avoid stupid compiler warnings */
135 jnrA = jnrB = jnrC = jnrD = 0;
144 for(iidx=0;iidx<4*DIM;iidx++)
149 /* Start outer loop over neighborlists */
150 for(iidx=0; iidx<nri; iidx++)
152 /* Load shift vector for this list */
153 i_shift_offset = DIM*shiftidx[iidx];
155 /* Load limits for loop over neighbors */
156 j_index_start = jindex[iidx];
157 j_index_end = jindex[iidx+1];
159 /* Get outer coordinate index */
161 i_coord_offset = DIM*inr;
163 /* Load i particle coords and add shift vector */
164 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
165 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
167 fix0 = _mm256_setzero_pd();
168 fiy0 = _mm256_setzero_pd();
169 fiz0 = _mm256_setzero_pd();
170 fix1 = _mm256_setzero_pd();
171 fiy1 = _mm256_setzero_pd();
172 fiz1 = _mm256_setzero_pd();
173 fix2 = _mm256_setzero_pd();
174 fiy2 = _mm256_setzero_pd();
175 fiz2 = _mm256_setzero_pd();
176 fix3 = _mm256_setzero_pd();
177 fiy3 = _mm256_setzero_pd();
178 fiz3 = _mm256_setzero_pd();
180 /* Reset potential sums */
181 velecsum = _mm256_setzero_pd();
182 vvdwsum = _mm256_setzero_pd();
184 /* Start inner kernel loop */
185 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
188 /* Get j neighbor index, and coordinate index */
193 j_coord_offsetA = DIM*jnrA;
194 j_coord_offsetB = DIM*jnrB;
195 j_coord_offsetC = DIM*jnrC;
196 j_coord_offsetD = DIM*jnrD;
198 /* load j atom coordinates */
199 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
200 x+j_coord_offsetC,x+j_coord_offsetD,
203 /* Calculate displacement vector */
204 dx00 = _mm256_sub_pd(ix0,jx0);
205 dy00 = _mm256_sub_pd(iy0,jy0);
206 dz00 = _mm256_sub_pd(iz0,jz0);
207 dx10 = _mm256_sub_pd(ix1,jx0);
208 dy10 = _mm256_sub_pd(iy1,jy0);
209 dz10 = _mm256_sub_pd(iz1,jz0);
210 dx20 = _mm256_sub_pd(ix2,jx0);
211 dy20 = _mm256_sub_pd(iy2,jy0);
212 dz20 = _mm256_sub_pd(iz2,jz0);
213 dx30 = _mm256_sub_pd(ix3,jx0);
214 dy30 = _mm256_sub_pd(iy3,jy0);
215 dz30 = _mm256_sub_pd(iz3,jz0);
217 /* Calculate squared distance and things based on it */
218 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
219 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
220 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
221 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
223 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
224 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
225 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
227 rinvsq00 = gmx_mm256_inv_pd(rsq00);
228 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
229 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
230 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
232 /* Load parameters for j particles */
233 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
234 charge+jnrC+0,charge+jnrD+0);
235 vdwjidx0A = 2*vdwtype[jnrA+0];
236 vdwjidx0B = 2*vdwtype[jnrB+0];
237 vdwjidx0C = 2*vdwtype[jnrC+0];
238 vdwjidx0D = 2*vdwtype[jnrD+0];
240 fjx0 = _mm256_setzero_pd();
241 fjy0 = _mm256_setzero_pd();
242 fjz0 = _mm256_setzero_pd();
244 /**************************
245 * CALCULATE INTERACTIONS *
246 **************************/
248 /* Compute parameters for interactions between i and j atoms */
249 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
250 vdwioffsetptr0+vdwjidx0B,
251 vdwioffsetptr0+vdwjidx0C,
252 vdwioffsetptr0+vdwjidx0D,
255 /* LENNARD-JONES DISPERSION/REPULSION */
257 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
258 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
259 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
260 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
261 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
263 /* Update potential sum for this i atom from the interaction with this j atom. */
264 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
268 /* Calculate temporary vectorial force */
269 tx = _mm256_mul_pd(fscal,dx00);
270 ty = _mm256_mul_pd(fscal,dy00);
271 tz = _mm256_mul_pd(fscal,dz00);
273 /* Update vectorial force */
274 fix0 = _mm256_add_pd(fix0,tx);
275 fiy0 = _mm256_add_pd(fiy0,ty);
276 fiz0 = _mm256_add_pd(fiz0,tz);
278 fjx0 = _mm256_add_pd(fjx0,tx);
279 fjy0 = _mm256_add_pd(fjy0,ty);
280 fjz0 = _mm256_add_pd(fjz0,tz);
282 /**************************
283 * CALCULATE INTERACTIONS *
284 **************************/
286 r10 = _mm256_mul_pd(rsq10,rinv10);
288 /* Compute parameters for interactions between i and j atoms */
289 qq10 = _mm256_mul_pd(iq1,jq0);
291 /* EWALD ELECTROSTATICS */
293 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
294 ewrt = _mm256_mul_pd(r10,ewtabscale);
295 ewitab = _mm256_cvttpd_epi32(ewrt);
296 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
297 ewitab = _mm_slli_epi32(ewitab,2);
298 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
299 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
300 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
301 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
302 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
303 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
304 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
305 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
306 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
308 /* Update potential sum for this i atom from the interaction with this j atom. */
309 velecsum = _mm256_add_pd(velecsum,velec);
313 /* Calculate temporary vectorial force */
314 tx = _mm256_mul_pd(fscal,dx10);
315 ty = _mm256_mul_pd(fscal,dy10);
316 tz = _mm256_mul_pd(fscal,dz10);
318 /* Update vectorial force */
319 fix1 = _mm256_add_pd(fix1,tx);
320 fiy1 = _mm256_add_pd(fiy1,ty);
321 fiz1 = _mm256_add_pd(fiz1,tz);
323 fjx0 = _mm256_add_pd(fjx0,tx);
324 fjy0 = _mm256_add_pd(fjy0,ty);
325 fjz0 = _mm256_add_pd(fjz0,tz);
327 /**************************
328 * CALCULATE INTERACTIONS *
329 **************************/
331 r20 = _mm256_mul_pd(rsq20,rinv20);
333 /* Compute parameters for interactions between i and j atoms */
334 qq20 = _mm256_mul_pd(iq2,jq0);
336 /* EWALD ELECTROSTATICS */
338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
339 ewrt = _mm256_mul_pd(r20,ewtabscale);
340 ewitab = _mm256_cvttpd_epi32(ewrt);
341 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
342 ewitab = _mm_slli_epi32(ewitab,2);
343 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
344 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
345 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
346 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
347 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
348 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
349 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
350 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
351 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
353 /* Update potential sum for this i atom from the interaction with this j atom. */
354 velecsum = _mm256_add_pd(velecsum,velec);
358 /* Calculate temporary vectorial force */
359 tx = _mm256_mul_pd(fscal,dx20);
360 ty = _mm256_mul_pd(fscal,dy20);
361 tz = _mm256_mul_pd(fscal,dz20);
363 /* Update vectorial force */
364 fix2 = _mm256_add_pd(fix2,tx);
365 fiy2 = _mm256_add_pd(fiy2,ty);
366 fiz2 = _mm256_add_pd(fiz2,tz);
368 fjx0 = _mm256_add_pd(fjx0,tx);
369 fjy0 = _mm256_add_pd(fjy0,ty);
370 fjz0 = _mm256_add_pd(fjz0,tz);
372 /**************************
373 * CALCULATE INTERACTIONS *
374 **************************/
376 r30 = _mm256_mul_pd(rsq30,rinv30);
378 /* Compute parameters for interactions between i and j atoms */
379 qq30 = _mm256_mul_pd(iq3,jq0);
381 /* EWALD ELECTROSTATICS */
383 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
384 ewrt = _mm256_mul_pd(r30,ewtabscale);
385 ewitab = _mm256_cvttpd_epi32(ewrt);
386 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
387 ewitab = _mm_slli_epi32(ewitab,2);
388 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
389 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
390 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
391 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
392 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
393 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
394 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
395 velec = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
396 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
398 /* Update potential sum for this i atom from the interaction with this j atom. */
399 velecsum = _mm256_add_pd(velecsum,velec);
403 /* Calculate temporary vectorial force */
404 tx = _mm256_mul_pd(fscal,dx30);
405 ty = _mm256_mul_pd(fscal,dy30);
406 tz = _mm256_mul_pd(fscal,dz30);
408 /* Update vectorial force */
409 fix3 = _mm256_add_pd(fix3,tx);
410 fiy3 = _mm256_add_pd(fiy3,ty);
411 fiz3 = _mm256_add_pd(fiz3,tz);
413 fjx0 = _mm256_add_pd(fjx0,tx);
414 fjy0 = _mm256_add_pd(fjy0,ty);
415 fjz0 = _mm256_add_pd(fjz0,tz);
417 fjptrA = f+j_coord_offsetA;
418 fjptrB = f+j_coord_offsetB;
419 fjptrC = f+j_coord_offsetC;
420 fjptrD = f+j_coord_offsetD;
422 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
424 /* Inner loop uses 158 flops */
430 /* Get j neighbor index, and coordinate index */
431 jnrlistA = jjnr[jidx];
432 jnrlistB = jjnr[jidx+1];
433 jnrlistC = jjnr[jidx+2];
434 jnrlistD = jjnr[jidx+3];
435 /* Sign of each element will be negative for non-real atoms.
436 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
437 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
439 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
441 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
442 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
443 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
445 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
446 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
447 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
448 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
449 j_coord_offsetA = DIM*jnrA;
450 j_coord_offsetB = DIM*jnrB;
451 j_coord_offsetC = DIM*jnrC;
452 j_coord_offsetD = DIM*jnrD;
454 /* load j atom coordinates */
455 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
456 x+j_coord_offsetC,x+j_coord_offsetD,
459 /* Calculate displacement vector */
460 dx00 = _mm256_sub_pd(ix0,jx0);
461 dy00 = _mm256_sub_pd(iy0,jy0);
462 dz00 = _mm256_sub_pd(iz0,jz0);
463 dx10 = _mm256_sub_pd(ix1,jx0);
464 dy10 = _mm256_sub_pd(iy1,jy0);
465 dz10 = _mm256_sub_pd(iz1,jz0);
466 dx20 = _mm256_sub_pd(ix2,jx0);
467 dy20 = _mm256_sub_pd(iy2,jy0);
468 dz20 = _mm256_sub_pd(iz2,jz0);
469 dx30 = _mm256_sub_pd(ix3,jx0);
470 dy30 = _mm256_sub_pd(iy3,jy0);
471 dz30 = _mm256_sub_pd(iz3,jz0);
473 /* Calculate squared distance and things based on it */
474 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
475 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
476 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
477 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
479 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
480 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
481 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
483 rinvsq00 = gmx_mm256_inv_pd(rsq00);
484 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
485 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
486 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
488 /* Load parameters for j particles */
489 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
490 charge+jnrC+0,charge+jnrD+0);
491 vdwjidx0A = 2*vdwtype[jnrA+0];
492 vdwjidx0B = 2*vdwtype[jnrB+0];
493 vdwjidx0C = 2*vdwtype[jnrC+0];
494 vdwjidx0D = 2*vdwtype[jnrD+0];
496 fjx0 = _mm256_setzero_pd();
497 fjy0 = _mm256_setzero_pd();
498 fjz0 = _mm256_setzero_pd();
500 /**************************
501 * CALCULATE INTERACTIONS *
502 **************************/
504 /* Compute parameters for interactions between i and j atoms */
505 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
506 vdwioffsetptr0+vdwjidx0B,
507 vdwioffsetptr0+vdwjidx0C,
508 vdwioffsetptr0+vdwjidx0D,
511 /* LENNARD-JONES DISPERSION/REPULSION */
513 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
514 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
515 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
516 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
517 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
519 /* Update potential sum for this i atom from the interaction with this j atom. */
520 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
521 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
525 fscal = _mm256_andnot_pd(dummy_mask,fscal);
527 /* Calculate temporary vectorial force */
528 tx = _mm256_mul_pd(fscal,dx00);
529 ty = _mm256_mul_pd(fscal,dy00);
530 tz = _mm256_mul_pd(fscal,dz00);
532 /* Update vectorial force */
533 fix0 = _mm256_add_pd(fix0,tx);
534 fiy0 = _mm256_add_pd(fiy0,ty);
535 fiz0 = _mm256_add_pd(fiz0,tz);
537 fjx0 = _mm256_add_pd(fjx0,tx);
538 fjy0 = _mm256_add_pd(fjy0,ty);
539 fjz0 = _mm256_add_pd(fjz0,tz);
541 /**************************
542 * CALCULATE INTERACTIONS *
543 **************************/
545 r10 = _mm256_mul_pd(rsq10,rinv10);
546 r10 = _mm256_andnot_pd(dummy_mask,r10);
548 /* Compute parameters for interactions between i and j atoms */
549 qq10 = _mm256_mul_pd(iq1,jq0);
551 /* EWALD ELECTROSTATICS */
553 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
554 ewrt = _mm256_mul_pd(r10,ewtabscale);
555 ewitab = _mm256_cvttpd_epi32(ewrt);
556 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
557 ewitab = _mm_slli_epi32(ewitab,2);
558 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
559 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
560 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
561 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
562 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
563 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
564 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
565 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
566 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velec = _mm256_andnot_pd(dummy_mask,velec);
570 velecsum = _mm256_add_pd(velecsum,velec);
574 fscal = _mm256_andnot_pd(dummy_mask,fscal);
576 /* Calculate temporary vectorial force */
577 tx = _mm256_mul_pd(fscal,dx10);
578 ty = _mm256_mul_pd(fscal,dy10);
579 tz = _mm256_mul_pd(fscal,dz10);
581 /* Update vectorial force */
582 fix1 = _mm256_add_pd(fix1,tx);
583 fiy1 = _mm256_add_pd(fiy1,ty);
584 fiz1 = _mm256_add_pd(fiz1,tz);
586 fjx0 = _mm256_add_pd(fjx0,tx);
587 fjy0 = _mm256_add_pd(fjy0,ty);
588 fjz0 = _mm256_add_pd(fjz0,tz);
590 /**************************
591 * CALCULATE INTERACTIONS *
592 **************************/
594 r20 = _mm256_mul_pd(rsq20,rinv20);
595 r20 = _mm256_andnot_pd(dummy_mask,r20);
597 /* Compute parameters for interactions between i and j atoms */
598 qq20 = _mm256_mul_pd(iq2,jq0);
600 /* EWALD ELECTROSTATICS */
602 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
603 ewrt = _mm256_mul_pd(r20,ewtabscale);
604 ewitab = _mm256_cvttpd_epi32(ewrt);
605 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
606 ewitab = _mm_slli_epi32(ewitab,2);
607 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
608 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
609 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
610 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
611 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
612 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
613 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
614 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
615 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
617 /* Update potential sum for this i atom from the interaction with this j atom. */
618 velec = _mm256_andnot_pd(dummy_mask,velec);
619 velecsum = _mm256_add_pd(velecsum,velec);
623 fscal = _mm256_andnot_pd(dummy_mask,fscal);
625 /* Calculate temporary vectorial force */
626 tx = _mm256_mul_pd(fscal,dx20);
627 ty = _mm256_mul_pd(fscal,dy20);
628 tz = _mm256_mul_pd(fscal,dz20);
630 /* Update vectorial force */
631 fix2 = _mm256_add_pd(fix2,tx);
632 fiy2 = _mm256_add_pd(fiy2,ty);
633 fiz2 = _mm256_add_pd(fiz2,tz);
635 fjx0 = _mm256_add_pd(fjx0,tx);
636 fjy0 = _mm256_add_pd(fjy0,ty);
637 fjz0 = _mm256_add_pd(fjz0,tz);
639 /**************************
640 * CALCULATE INTERACTIONS *
641 **************************/
643 r30 = _mm256_mul_pd(rsq30,rinv30);
644 r30 = _mm256_andnot_pd(dummy_mask,r30);
646 /* Compute parameters for interactions between i and j atoms */
647 qq30 = _mm256_mul_pd(iq3,jq0);
649 /* EWALD ELECTROSTATICS */
651 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
652 ewrt = _mm256_mul_pd(r30,ewtabscale);
653 ewitab = _mm256_cvttpd_epi32(ewrt);
654 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
655 ewitab = _mm_slli_epi32(ewitab,2);
656 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
657 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
658 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
659 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
660 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
661 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
662 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
663 velec = _mm256_mul_pd(qq30,_mm256_sub_pd(rinv30,velec));
664 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
666 /* Update potential sum for this i atom from the interaction with this j atom. */
667 velec = _mm256_andnot_pd(dummy_mask,velec);
668 velecsum = _mm256_add_pd(velecsum,velec);
672 fscal = _mm256_andnot_pd(dummy_mask,fscal);
674 /* Calculate temporary vectorial force */
675 tx = _mm256_mul_pd(fscal,dx30);
676 ty = _mm256_mul_pd(fscal,dy30);
677 tz = _mm256_mul_pd(fscal,dz30);
679 /* Update vectorial force */
680 fix3 = _mm256_add_pd(fix3,tx);
681 fiy3 = _mm256_add_pd(fiy3,ty);
682 fiz3 = _mm256_add_pd(fiz3,tz);
684 fjx0 = _mm256_add_pd(fjx0,tx);
685 fjy0 = _mm256_add_pd(fjy0,ty);
686 fjz0 = _mm256_add_pd(fjz0,tz);
688 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
689 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
690 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
691 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
693 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
695 /* Inner loop uses 161 flops */
698 /* End of innermost loop */
700 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
701 f+i_coord_offset,fshift+i_shift_offset);
704 /* Update potential energies */
705 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
706 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
708 /* Increment number of inner iterations */
709 inneriter += j_index_end - j_index_start;
711 /* Outer loop uses 26 flops */
714 /* Increment number of outer iterations */
717 /* Update outer/inner flops */
719 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*161);
722 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_double
723 * Electrostatics interaction: Ewald
724 * VdW interaction: LennardJones
725 * Geometry: Water4-Particle
726 * Calculate force/pot: Force
729 nb_kernel_ElecEw_VdwLJ_GeomW4P1_F_avx_256_double
730 (t_nblist * gmx_restrict nlist,
731 rvec * gmx_restrict xx,
732 rvec * gmx_restrict ff,
733 t_forcerec * gmx_restrict fr,
734 t_mdatoms * gmx_restrict mdatoms,
735 nb_kernel_data_t * gmx_restrict kernel_data,
736 t_nrnb * gmx_restrict nrnb)
738 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
739 * just 0 for non-waters.
740 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
741 * jnr indices corresponding to data put in the four positions in the SIMD register.
743 int i_shift_offset,i_coord_offset,outeriter,inneriter;
744 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
745 int jnrA,jnrB,jnrC,jnrD;
746 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
747 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
748 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
749 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
751 real *shiftvec,*fshift,*x,*f;
752 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
754 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
755 real * vdwioffsetptr0;
756 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
757 real * vdwioffsetptr1;
758 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
759 real * vdwioffsetptr2;
760 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
761 real * vdwioffsetptr3;
762 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
763 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
764 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
765 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
766 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
767 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
768 __m256d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
769 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
772 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
775 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
776 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
778 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
779 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
781 __m256d dummy_mask,cutoff_mask;
782 __m128 tmpmask0,tmpmask1;
783 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
784 __m256d one = _mm256_set1_pd(1.0);
785 __m256d two = _mm256_set1_pd(2.0);
791 jindex = nlist->jindex;
793 shiftidx = nlist->shift;
795 shiftvec = fr->shift_vec[0];
796 fshift = fr->fshift[0];
797 facel = _mm256_set1_pd(fr->epsfac);
798 charge = mdatoms->chargeA;
799 nvdwtype = fr->ntype;
801 vdwtype = mdatoms->typeA;
803 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
804 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
805 beta2 = _mm256_mul_pd(beta,beta);
806 beta3 = _mm256_mul_pd(beta,beta2);
808 ewtab = fr->ic->tabq_coul_F;
809 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
810 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
812 /* Setup water-specific parameters */
813 inr = nlist->iinr[0];
814 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
815 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
816 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
817 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
819 /* Avoid stupid compiler warnings */
820 jnrA = jnrB = jnrC = jnrD = 0;
829 for(iidx=0;iidx<4*DIM;iidx++)
834 /* Start outer loop over neighborlists */
835 for(iidx=0; iidx<nri; iidx++)
837 /* Load shift vector for this list */
838 i_shift_offset = DIM*shiftidx[iidx];
840 /* Load limits for loop over neighbors */
841 j_index_start = jindex[iidx];
842 j_index_end = jindex[iidx+1];
844 /* Get outer coordinate index */
846 i_coord_offset = DIM*inr;
848 /* Load i particle coords and add shift vector */
849 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
850 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
852 fix0 = _mm256_setzero_pd();
853 fiy0 = _mm256_setzero_pd();
854 fiz0 = _mm256_setzero_pd();
855 fix1 = _mm256_setzero_pd();
856 fiy1 = _mm256_setzero_pd();
857 fiz1 = _mm256_setzero_pd();
858 fix2 = _mm256_setzero_pd();
859 fiy2 = _mm256_setzero_pd();
860 fiz2 = _mm256_setzero_pd();
861 fix3 = _mm256_setzero_pd();
862 fiy3 = _mm256_setzero_pd();
863 fiz3 = _mm256_setzero_pd();
865 /* Start inner kernel loop */
866 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
869 /* Get j neighbor index, and coordinate index */
874 j_coord_offsetA = DIM*jnrA;
875 j_coord_offsetB = DIM*jnrB;
876 j_coord_offsetC = DIM*jnrC;
877 j_coord_offsetD = DIM*jnrD;
879 /* load j atom coordinates */
880 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
881 x+j_coord_offsetC,x+j_coord_offsetD,
884 /* Calculate displacement vector */
885 dx00 = _mm256_sub_pd(ix0,jx0);
886 dy00 = _mm256_sub_pd(iy0,jy0);
887 dz00 = _mm256_sub_pd(iz0,jz0);
888 dx10 = _mm256_sub_pd(ix1,jx0);
889 dy10 = _mm256_sub_pd(iy1,jy0);
890 dz10 = _mm256_sub_pd(iz1,jz0);
891 dx20 = _mm256_sub_pd(ix2,jx0);
892 dy20 = _mm256_sub_pd(iy2,jy0);
893 dz20 = _mm256_sub_pd(iz2,jz0);
894 dx30 = _mm256_sub_pd(ix3,jx0);
895 dy30 = _mm256_sub_pd(iy3,jy0);
896 dz30 = _mm256_sub_pd(iz3,jz0);
898 /* Calculate squared distance and things based on it */
899 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
900 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
901 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
902 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
904 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
905 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
906 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
908 rinvsq00 = gmx_mm256_inv_pd(rsq00);
909 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
910 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
911 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
913 /* Load parameters for j particles */
914 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
915 charge+jnrC+0,charge+jnrD+0);
916 vdwjidx0A = 2*vdwtype[jnrA+0];
917 vdwjidx0B = 2*vdwtype[jnrB+0];
918 vdwjidx0C = 2*vdwtype[jnrC+0];
919 vdwjidx0D = 2*vdwtype[jnrD+0];
921 fjx0 = _mm256_setzero_pd();
922 fjy0 = _mm256_setzero_pd();
923 fjz0 = _mm256_setzero_pd();
925 /**************************
926 * CALCULATE INTERACTIONS *
927 **************************/
929 /* Compute parameters for interactions between i and j atoms */
930 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
931 vdwioffsetptr0+vdwjidx0B,
932 vdwioffsetptr0+vdwjidx0C,
933 vdwioffsetptr0+vdwjidx0D,
936 /* LENNARD-JONES DISPERSION/REPULSION */
938 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
939 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
943 /* Calculate temporary vectorial force */
944 tx = _mm256_mul_pd(fscal,dx00);
945 ty = _mm256_mul_pd(fscal,dy00);
946 tz = _mm256_mul_pd(fscal,dz00);
948 /* Update vectorial force */
949 fix0 = _mm256_add_pd(fix0,tx);
950 fiy0 = _mm256_add_pd(fiy0,ty);
951 fiz0 = _mm256_add_pd(fiz0,tz);
953 fjx0 = _mm256_add_pd(fjx0,tx);
954 fjy0 = _mm256_add_pd(fjy0,ty);
955 fjz0 = _mm256_add_pd(fjz0,tz);
957 /**************************
958 * CALCULATE INTERACTIONS *
959 **************************/
961 r10 = _mm256_mul_pd(rsq10,rinv10);
963 /* Compute parameters for interactions between i and j atoms */
964 qq10 = _mm256_mul_pd(iq1,jq0);
966 /* EWALD ELECTROSTATICS */
968 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
969 ewrt = _mm256_mul_pd(r10,ewtabscale);
970 ewitab = _mm256_cvttpd_epi32(ewrt);
971 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
972 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
973 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
975 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
976 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
980 /* Calculate temporary vectorial force */
981 tx = _mm256_mul_pd(fscal,dx10);
982 ty = _mm256_mul_pd(fscal,dy10);
983 tz = _mm256_mul_pd(fscal,dz10);
985 /* Update vectorial force */
986 fix1 = _mm256_add_pd(fix1,tx);
987 fiy1 = _mm256_add_pd(fiy1,ty);
988 fiz1 = _mm256_add_pd(fiz1,tz);
990 fjx0 = _mm256_add_pd(fjx0,tx);
991 fjy0 = _mm256_add_pd(fjy0,ty);
992 fjz0 = _mm256_add_pd(fjz0,tz);
994 /**************************
995 * CALCULATE INTERACTIONS *
996 **************************/
998 r20 = _mm256_mul_pd(rsq20,rinv20);
1000 /* Compute parameters for interactions between i and j atoms */
1001 qq20 = _mm256_mul_pd(iq2,jq0);
1003 /* EWALD ELECTROSTATICS */
1005 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1006 ewrt = _mm256_mul_pd(r20,ewtabscale);
1007 ewitab = _mm256_cvttpd_epi32(ewrt);
1008 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1009 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1010 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1012 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1013 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1017 /* Calculate temporary vectorial force */
1018 tx = _mm256_mul_pd(fscal,dx20);
1019 ty = _mm256_mul_pd(fscal,dy20);
1020 tz = _mm256_mul_pd(fscal,dz20);
1022 /* Update vectorial force */
1023 fix2 = _mm256_add_pd(fix2,tx);
1024 fiy2 = _mm256_add_pd(fiy2,ty);
1025 fiz2 = _mm256_add_pd(fiz2,tz);
1027 fjx0 = _mm256_add_pd(fjx0,tx);
1028 fjy0 = _mm256_add_pd(fjy0,ty);
1029 fjz0 = _mm256_add_pd(fjz0,tz);
1031 /**************************
1032 * CALCULATE INTERACTIONS *
1033 **************************/
1035 r30 = _mm256_mul_pd(rsq30,rinv30);
1037 /* Compute parameters for interactions between i and j atoms */
1038 qq30 = _mm256_mul_pd(iq3,jq0);
1040 /* EWALD ELECTROSTATICS */
1042 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1043 ewrt = _mm256_mul_pd(r30,ewtabscale);
1044 ewitab = _mm256_cvttpd_epi32(ewrt);
1045 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1046 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1047 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1049 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1050 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
1054 /* Calculate temporary vectorial force */
1055 tx = _mm256_mul_pd(fscal,dx30);
1056 ty = _mm256_mul_pd(fscal,dy30);
1057 tz = _mm256_mul_pd(fscal,dz30);
1059 /* Update vectorial force */
1060 fix3 = _mm256_add_pd(fix3,tx);
1061 fiy3 = _mm256_add_pd(fiy3,ty);
1062 fiz3 = _mm256_add_pd(fiz3,tz);
1064 fjx0 = _mm256_add_pd(fjx0,tx);
1065 fjy0 = _mm256_add_pd(fjy0,ty);
1066 fjz0 = _mm256_add_pd(fjz0,tz);
1068 fjptrA = f+j_coord_offsetA;
1069 fjptrB = f+j_coord_offsetB;
1070 fjptrC = f+j_coord_offsetC;
1071 fjptrD = f+j_coord_offsetD;
1073 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1075 /* Inner loop uses 138 flops */
1078 if(jidx<j_index_end)
1081 /* Get j neighbor index, and coordinate index */
1082 jnrlistA = jjnr[jidx];
1083 jnrlistB = jjnr[jidx+1];
1084 jnrlistC = jjnr[jidx+2];
1085 jnrlistD = jjnr[jidx+3];
1086 /* Sign of each element will be negative for non-real atoms.
1087 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1088 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1090 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1092 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1093 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1094 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1096 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1097 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1098 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1099 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1100 j_coord_offsetA = DIM*jnrA;
1101 j_coord_offsetB = DIM*jnrB;
1102 j_coord_offsetC = DIM*jnrC;
1103 j_coord_offsetD = DIM*jnrD;
1105 /* load j atom coordinates */
1106 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1107 x+j_coord_offsetC,x+j_coord_offsetD,
1110 /* Calculate displacement vector */
1111 dx00 = _mm256_sub_pd(ix0,jx0);
1112 dy00 = _mm256_sub_pd(iy0,jy0);
1113 dz00 = _mm256_sub_pd(iz0,jz0);
1114 dx10 = _mm256_sub_pd(ix1,jx0);
1115 dy10 = _mm256_sub_pd(iy1,jy0);
1116 dz10 = _mm256_sub_pd(iz1,jz0);
1117 dx20 = _mm256_sub_pd(ix2,jx0);
1118 dy20 = _mm256_sub_pd(iy2,jy0);
1119 dz20 = _mm256_sub_pd(iz2,jz0);
1120 dx30 = _mm256_sub_pd(ix3,jx0);
1121 dy30 = _mm256_sub_pd(iy3,jy0);
1122 dz30 = _mm256_sub_pd(iz3,jz0);
1124 /* Calculate squared distance and things based on it */
1125 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1126 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1127 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1128 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
1130 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1131 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1132 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
1134 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1135 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1136 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1137 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
1139 /* Load parameters for j particles */
1140 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1141 charge+jnrC+0,charge+jnrD+0);
1142 vdwjidx0A = 2*vdwtype[jnrA+0];
1143 vdwjidx0B = 2*vdwtype[jnrB+0];
1144 vdwjidx0C = 2*vdwtype[jnrC+0];
1145 vdwjidx0D = 2*vdwtype[jnrD+0];
1147 fjx0 = _mm256_setzero_pd();
1148 fjy0 = _mm256_setzero_pd();
1149 fjz0 = _mm256_setzero_pd();
1151 /**************************
1152 * CALCULATE INTERACTIONS *
1153 **************************/
1155 /* Compute parameters for interactions between i and j atoms */
1156 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1157 vdwioffsetptr0+vdwjidx0B,
1158 vdwioffsetptr0+vdwjidx0C,
1159 vdwioffsetptr0+vdwjidx0D,
1162 /* LENNARD-JONES DISPERSION/REPULSION */
1164 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1165 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1169 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1171 /* Calculate temporary vectorial force */
1172 tx = _mm256_mul_pd(fscal,dx00);
1173 ty = _mm256_mul_pd(fscal,dy00);
1174 tz = _mm256_mul_pd(fscal,dz00);
1176 /* Update vectorial force */
1177 fix0 = _mm256_add_pd(fix0,tx);
1178 fiy0 = _mm256_add_pd(fiy0,ty);
1179 fiz0 = _mm256_add_pd(fiz0,tz);
1181 fjx0 = _mm256_add_pd(fjx0,tx);
1182 fjy0 = _mm256_add_pd(fjy0,ty);
1183 fjz0 = _mm256_add_pd(fjz0,tz);
1185 /**************************
1186 * CALCULATE INTERACTIONS *
1187 **************************/
1189 r10 = _mm256_mul_pd(rsq10,rinv10);
1190 r10 = _mm256_andnot_pd(dummy_mask,r10);
1192 /* Compute parameters for interactions between i and j atoms */
1193 qq10 = _mm256_mul_pd(iq1,jq0);
1195 /* EWALD ELECTROSTATICS */
1197 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1198 ewrt = _mm256_mul_pd(r10,ewtabscale);
1199 ewitab = _mm256_cvttpd_epi32(ewrt);
1200 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1201 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1202 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1204 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1205 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1209 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1211 /* Calculate temporary vectorial force */
1212 tx = _mm256_mul_pd(fscal,dx10);
1213 ty = _mm256_mul_pd(fscal,dy10);
1214 tz = _mm256_mul_pd(fscal,dz10);
1216 /* Update vectorial force */
1217 fix1 = _mm256_add_pd(fix1,tx);
1218 fiy1 = _mm256_add_pd(fiy1,ty);
1219 fiz1 = _mm256_add_pd(fiz1,tz);
1221 fjx0 = _mm256_add_pd(fjx0,tx);
1222 fjy0 = _mm256_add_pd(fjy0,ty);
1223 fjz0 = _mm256_add_pd(fjz0,tz);
1225 /**************************
1226 * CALCULATE INTERACTIONS *
1227 **************************/
1229 r20 = _mm256_mul_pd(rsq20,rinv20);
1230 r20 = _mm256_andnot_pd(dummy_mask,r20);
1232 /* Compute parameters for interactions between i and j atoms */
1233 qq20 = _mm256_mul_pd(iq2,jq0);
1235 /* EWALD ELECTROSTATICS */
1237 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1238 ewrt = _mm256_mul_pd(r20,ewtabscale);
1239 ewitab = _mm256_cvttpd_epi32(ewrt);
1240 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1241 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1242 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1244 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1245 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1249 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1251 /* Calculate temporary vectorial force */
1252 tx = _mm256_mul_pd(fscal,dx20);
1253 ty = _mm256_mul_pd(fscal,dy20);
1254 tz = _mm256_mul_pd(fscal,dz20);
1256 /* Update vectorial force */
1257 fix2 = _mm256_add_pd(fix2,tx);
1258 fiy2 = _mm256_add_pd(fiy2,ty);
1259 fiz2 = _mm256_add_pd(fiz2,tz);
1261 fjx0 = _mm256_add_pd(fjx0,tx);
1262 fjy0 = _mm256_add_pd(fjy0,ty);
1263 fjz0 = _mm256_add_pd(fjz0,tz);
1265 /**************************
1266 * CALCULATE INTERACTIONS *
1267 **************************/
1269 r30 = _mm256_mul_pd(rsq30,rinv30);
1270 r30 = _mm256_andnot_pd(dummy_mask,r30);
1272 /* Compute parameters for interactions between i and j atoms */
1273 qq30 = _mm256_mul_pd(iq3,jq0);
1275 /* EWALD ELECTROSTATICS */
1277 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1278 ewrt = _mm256_mul_pd(r30,ewtabscale);
1279 ewitab = _mm256_cvttpd_epi32(ewrt);
1280 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1281 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1282 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1284 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1285 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
1289 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1291 /* Calculate temporary vectorial force */
1292 tx = _mm256_mul_pd(fscal,dx30);
1293 ty = _mm256_mul_pd(fscal,dy30);
1294 tz = _mm256_mul_pd(fscal,dz30);
1296 /* Update vectorial force */
1297 fix3 = _mm256_add_pd(fix3,tx);
1298 fiy3 = _mm256_add_pd(fiy3,ty);
1299 fiz3 = _mm256_add_pd(fiz3,tz);
1301 fjx0 = _mm256_add_pd(fjx0,tx);
1302 fjy0 = _mm256_add_pd(fjy0,ty);
1303 fjz0 = _mm256_add_pd(fjz0,tz);
1305 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1306 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1307 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1308 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1310 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1312 /* Inner loop uses 141 flops */
1315 /* End of innermost loop */
1317 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1318 f+i_coord_offset,fshift+i_shift_offset);
1320 /* Increment number of inner iterations */
1321 inneriter += j_index_end - j_index_start;
1323 /* Outer loop uses 24 flops */
1326 /* Increment number of outer iterations */
1329 /* Update outer/inner flops */
1331 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*141);