2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: LennardJones
54 * Geometry: Water3-Particle
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
94 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
95 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
98 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
102 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
104 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
105 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
107 __m256d dummy_mask,cutoff_mask;
108 __m128 tmpmask0,tmpmask1;
109 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
110 __m256d one = _mm256_set1_pd(1.0);
111 __m256d two = _mm256_set1_pd(2.0);
117 jindex = nlist->jindex;
119 shiftidx = nlist->shift;
121 shiftvec = fr->shift_vec[0];
122 fshift = fr->fshift[0];
123 facel = _mm256_set1_pd(fr->epsfac);
124 charge = mdatoms->chargeA;
125 nvdwtype = fr->ntype;
127 vdwtype = mdatoms->typeA;
129 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
130 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
131 beta2 = _mm256_mul_pd(beta,beta);
132 beta3 = _mm256_mul_pd(beta,beta2);
134 ewtab = fr->ic->tabq_coul_FDV0;
135 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
136 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
138 /* Setup water-specific parameters */
139 inr = nlist->iinr[0];
140 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
141 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
142 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
143 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
145 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
146 rcutoff_scalar = fr->rcoulomb;
147 rcutoff = _mm256_set1_pd(rcutoff_scalar);
148 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
150 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
151 rvdw = _mm256_set1_pd(fr->rvdw);
153 /* Avoid stupid compiler warnings */
154 jnrA = jnrB = jnrC = jnrD = 0;
163 for(iidx=0;iidx<4*DIM;iidx++)
168 /* Start outer loop over neighborlists */
169 for(iidx=0; iidx<nri; iidx++)
171 /* Load shift vector for this list */
172 i_shift_offset = DIM*shiftidx[iidx];
174 /* Load limits for loop over neighbors */
175 j_index_start = jindex[iidx];
176 j_index_end = jindex[iidx+1];
178 /* Get outer coordinate index */
180 i_coord_offset = DIM*inr;
182 /* Load i particle coords and add shift vector */
183 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
184 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
186 fix0 = _mm256_setzero_pd();
187 fiy0 = _mm256_setzero_pd();
188 fiz0 = _mm256_setzero_pd();
189 fix1 = _mm256_setzero_pd();
190 fiy1 = _mm256_setzero_pd();
191 fiz1 = _mm256_setzero_pd();
192 fix2 = _mm256_setzero_pd();
193 fiy2 = _mm256_setzero_pd();
194 fiz2 = _mm256_setzero_pd();
196 /* Reset potential sums */
197 velecsum = _mm256_setzero_pd();
198 vvdwsum = _mm256_setzero_pd();
200 /* Start inner kernel loop */
201 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
204 /* Get j neighbor index, and coordinate index */
209 j_coord_offsetA = DIM*jnrA;
210 j_coord_offsetB = DIM*jnrB;
211 j_coord_offsetC = DIM*jnrC;
212 j_coord_offsetD = DIM*jnrD;
214 /* load j atom coordinates */
215 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
216 x+j_coord_offsetC,x+j_coord_offsetD,
219 /* Calculate displacement vector */
220 dx00 = _mm256_sub_pd(ix0,jx0);
221 dy00 = _mm256_sub_pd(iy0,jy0);
222 dz00 = _mm256_sub_pd(iz0,jz0);
223 dx10 = _mm256_sub_pd(ix1,jx0);
224 dy10 = _mm256_sub_pd(iy1,jy0);
225 dz10 = _mm256_sub_pd(iz1,jz0);
226 dx20 = _mm256_sub_pd(ix2,jx0);
227 dy20 = _mm256_sub_pd(iy2,jy0);
228 dz20 = _mm256_sub_pd(iz2,jz0);
230 /* Calculate squared distance and things based on it */
231 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
232 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
233 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
235 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
236 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
237 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
239 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
240 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
241 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
243 /* Load parameters for j particles */
244 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
245 charge+jnrC+0,charge+jnrD+0);
246 vdwjidx0A = 2*vdwtype[jnrA+0];
247 vdwjidx0B = 2*vdwtype[jnrB+0];
248 vdwjidx0C = 2*vdwtype[jnrC+0];
249 vdwjidx0D = 2*vdwtype[jnrD+0];
251 fjx0 = _mm256_setzero_pd();
252 fjy0 = _mm256_setzero_pd();
253 fjz0 = _mm256_setzero_pd();
255 /**************************
256 * CALCULATE INTERACTIONS *
257 **************************/
259 if (gmx_mm256_any_lt(rsq00,rcutoff2))
262 r00 = _mm256_mul_pd(rsq00,rinv00);
264 /* Compute parameters for interactions between i and j atoms */
265 qq00 = _mm256_mul_pd(iq0,jq0);
266 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
267 vdwioffsetptr0+vdwjidx0B,
268 vdwioffsetptr0+vdwjidx0C,
269 vdwioffsetptr0+vdwjidx0D,
272 /* EWALD ELECTROSTATICS */
274 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
275 ewrt = _mm256_mul_pd(r00,ewtabscale);
276 ewitab = _mm256_cvttpd_epi32(ewrt);
277 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
278 ewitab = _mm_slli_epi32(ewitab,2);
279 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
280 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
281 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
282 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
283 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
284 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
285 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
286 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
287 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
289 /* LENNARD-JONES DISPERSION/REPULSION */
291 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
292 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
293 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
294 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
295 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
296 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
298 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
300 /* Update potential sum for this i atom from the interaction with this j atom. */
301 velec = _mm256_and_pd(velec,cutoff_mask);
302 velecsum = _mm256_add_pd(velecsum,velec);
303 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
304 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
306 fscal = _mm256_add_pd(felec,fvdw);
308 fscal = _mm256_and_pd(fscal,cutoff_mask);
310 /* Calculate temporary vectorial force */
311 tx = _mm256_mul_pd(fscal,dx00);
312 ty = _mm256_mul_pd(fscal,dy00);
313 tz = _mm256_mul_pd(fscal,dz00);
315 /* Update vectorial force */
316 fix0 = _mm256_add_pd(fix0,tx);
317 fiy0 = _mm256_add_pd(fiy0,ty);
318 fiz0 = _mm256_add_pd(fiz0,tz);
320 fjx0 = _mm256_add_pd(fjx0,tx);
321 fjy0 = _mm256_add_pd(fjy0,ty);
322 fjz0 = _mm256_add_pd(fjz0,tz);
326 /**************************
327 * CALCULATE INTERACTIONS *
328 **************************/
330 if (gmx_mm256_any_lt(rsq10,rcutoff2))
333 r10 = _mm256_mul_pd(rsq10,rinv10);
335 /* Compute parameters for interactions between i and j atoms */
336 qq10 = _mm256_mul_pd(iq1,jq0);
338 /* EWALD ELECTROSTATICS */
340 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
341 ewrt = _mm256_mul_pd(r10,ewtabscale);
342 ewitab = _mm256_cvttpd_epi32(ewrt);
343 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
344 ewitab = _mm_slli_epi32(ewitab,2);
345 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
346 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
347 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
348 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
349 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
350 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
351 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
352 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
353 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
355 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
357 /* Update potential sum for this i atom from the interaction with this j atom. */
358 velec = _mm256_and_pd(velec,cutoff_mask);
359 velecsum = _mm256_add_pd(velecsum,velec);
363 fscal = _mm256_and_pd(fscal,cutoff_mask);
365 /* Calculate temporary vectorial force */
366 tx = _mm256_mul_pd(fscal,dx10);
367 ty = _mm256_mul_pd(fscal,dy10);
368 tz = _mm256_mul_pd(fscal,dz10);
370 /* Update vectorial force */
371 fix1 = _mm256_add_pd(fix1,tx);
372 fiy1 = _mm256_add_pd(fiy1,ty);
373 fiz1 = _mm256_add_pd(fiz1,tz);
375 fjx0 = _mm256_add_pd(fjx0,tx);
376 fjy0 = _mm256_add_pd(fjy0,ty);
377 fjz0 = _mm256_add_pd(fjz0,tz);
381 /**************************
382 * CALCULATE INTERACTIONS *
383 **************************/
385 if (gmx_mm256_any_lt(rsq20,rcutoff2))
388 r20 = _mm256_mul_pd(rsq20,rinv20);
390 /* Compute parameters for interactions between i and j atoms */
391 qq20 = _mm256_mul_pd(iq2,jq0);
393 /* EWALD ELECTROSTATICS */
395 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
396 ewrt = _mm256_mul_pd(r20,ewtabscale);
397 ewitab = _mm256_cvttpd_epi32(ewrt);
398 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
399 ewitab = _mm_slli_epi32(ewitab,2);
400 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
401 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
402 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
403 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
404 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
405 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
406 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
407 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
408 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
410 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
412 /* Update potential sum for this i atom from the interaction with this j atom. */
413 velec = _mm256_and_pd(velec,cutoff_mask);
414 velecsum = _mm256_add_pd(velecsum,velec);
418 fscal = _mm256_and_pd(fscal,cutoff_mask);
420 /* Calculate temporary vectorial force */
421 tx = _mm256_mul_pd(fscal,dx20);
422 ty = _mm256_mul_pd(fscal,dy20);
423 tz = _mm256_mul_pd(fscal,dz20);
425 /* Update vectorial force */
426 fix2 = _mm256_add_pd(fix2,tx);
427 fiy2 = _mm256_add_pd(fiy2,ty);
428 fiz2 = _mm256_add_pd(fiz2,tz);
430 fjx0 = _mm256_add_pd(fjx0,tx);
431 fjy0 = _mm256_add_pd(fjy0,ty);
432 fjz0 = _mm256_add_pd(fjz0,tz);
436 fjptrA = f+j_coord_offsetA;
437 fjptrB = f+j_coord_offsetB;
438 fjptrC = f+j_coord_offsetC;
439 fjptrD = f+j_coord_offsetD;
441 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
443 /* Inner loop uses 159 flops */
449 /* Get j neighbor index, and coordinate index */
450 jnrlistA = jjnr[jidx];
451 jnrlistB = jjnr[jidx+1];
452 jnrlistC = jjnr[jidx+2];
453 jnrlistD = jjnr[jidx+3];
454 /* Sign of each element will be negative for non-real atoms.
455 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
456 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
458 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
460 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
461 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
462 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
464 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
465 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
466 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
467 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
468 j_coord_offsetA = DIM*jnrA;
469 j_coord_offsetB = DIM*jnrB;
470 j_coord_offsetC = DIM*jnrC;
471 j_coord_offsetD = DIM*jnrD;
473 /* load j atom coordinates */
474 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
475 x+j_coord_offsetC,x+j_coord_offsetD,
478 /* Calculate displacement vector */
479 dx00 = _mm256_sub_pd(ix0,jx0);
480 dy00 = _mm256_sub_pd(iy0,jy0);
481 dz00 = _mm256_sub_pd(iz0,jz0);
482 dx10 = _mm256_sub_pd(ix1,jx0);
483 dy10 = _mm256_sub_pd(iy1,jy0);
484 dz10 = _mm256_sub_pd(iz1,jz0);
485 dx20 = _mm256_sub_pd(ix2,jx0);
486 dy20 = _mm256_sub_pd(iy2,jy0);
487 dz20 = _mm256_sub_pd(iz2,jz0);
489 /* Calculate squared distance and things based on it */
490 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
491 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
492 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
494 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
495 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
496 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
498 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
499 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
500 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
502 /* Load parameters for j particles */
503 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
504 charge+jnrC+0,charge+jnrD+0);
505 vdwjidx0A = 2*vdwtype[jnrA+0];
506 vdwjidx0B = 2*vdwtype[jnrB+0];
507 vdwjidx0C = 2*vdwtype[jnrC+0];
508 vdwjidx0D = 2*vdwtype[jnrD+0];
510 fjx0 = _mm256_setzero_pd();
511 fjy0 = _mm256_setzero_pd();
512 fjz0 = _mm256_setzero_pd();
514 /**************************
515 * CALCULATE INTERACTIONS *
516 **************************/
518 if (gmx_mm256_any_lt(rsq00,rcutoff2))
521 r00 = _mm256_mul_pd(rsq00,rinv00);
522 r00 = _mm256_andnot_pd(dummy_mask,r00);
524 /* Compute parameters for interactions between i and j atoms */
525 qq00 = _mm256_mul_pd(iq0,jq0);
526 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
527 vdwioffsetptr0+vdwjidx0B,
528 vdwioffsetptr0+vdwjidx0C,
529 vdwioffsetptr0+vdwjidx0D,
532 /* EWALD ELECTROSTATICS */
534 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
535 ewrt = _mm256_mul_pd(r00,ewtabscale);
536 ewitab = _mm256_cvttpd_epi32(ewrt);
537 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
538 ewitab = _mm_slli_epi32(ewitab,2);
539 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
540 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
541 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
542 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
543 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
544 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
545 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
546 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(_mm256_sub_pd(rinv00,sh_ewald),velec));
547 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
549 /* LENNARD-JONES DISPERSION/REPULSION */
551 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
552 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
553 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
554 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
555 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
556 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
558 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
560 /* Update potential sum for this i atom from the interaction with this j atom. */
561 velec = _mm256_and_pd(velec,cutoff_mask);
562 velec = _mm256_andnot_pd(dummy_mask,velec);
563 velecsum = _mm256_add_pd(velecsum,velec);
564 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
565 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
566 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
568 fscal = _mm256_add_pd(felec,fvdw);
570 fscal = _mm256_and_pd(fscal,cutoff_mask);
572 fscal = _mm256_andnot_pd(dummy_mask,fscal);
574 /* Calculate temporary vectorial force */
575 tx = _mm256_mul_pd(fscal,dx00);
576 ty = _mm256_mul_pd(fscal,dy00);
577 tz = _mm256_mul_pd(fscal,dz00);
579 /* Update vectorial force */
580 fix0 = _mm256_add_pd(fix0,tx);
581 fiy0 = _mm256_add_pd(fiy0,ty);
582 fiz0 = _mm256_add_pd(fiz0,tz);
584 fjx0 = _mm256_add_pd(fjx0,tx);
585 fjy0 = _mm256_add_pd(fjy0,ty);
586 fjz0 = _mm256_add_pd(fjz0,tz);
590 /**************************
591 * CALCULATE INTERACTIONS *
592 **************************/
594 if (gmx_mm256_any_lt(rsq10,rcutoff2))
597 r10 = _mm256_mul_pd(rsq10,rinv10);
598 r10 = _mm256_andnot_pd(dummy_mask,r10);
600 /* Compute parameters for interactions between i and j atoms */
601 qq10 = _mm256_mul_pd(iq1,jq0);
603 /* EWALD ELECTROSTATICS */
605 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
606 ewrt = _mm256_mul_pd(r10,ewtabscale);
607 ewitab = _mm256_cvttpd_epi32(ewrt);
608 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
609 ewitab = _mm_slli_epi32(ewitab,2);
610 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
611 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
612 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
613 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
614 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
615 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
616 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
617 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
618 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
620 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
622 /* Update potential sum for this i atom from the interaction with this j atom. */
623 velec = _mm256_and_pd(velec,cutoff_mask);
624 velec = _mm256_andnot_pd(dummy_mask,velec);
625 velecsum = _mm256_add_pd(velecsum,velec);
629 fscal = _mm256_and_pd(fscal,cutoff_mask);
631 fscal = _mm256_andnot_pd(dummy_mask,fscal);
633 /* Calculate temporary vectorial force */
634 tx = _mm256_mul_pd(fscal,dx10);
635 ty = _mm256_mul_pd(fscal,dy10);
636 tz = _mm256_mul_pd(fscal,dz10);
638 /* Update vectorial force */
639 fix1 = _mm256_add_pd(fix1,tx);
640 fiy1 = _mm256_add_pd(fiy1,ty);
641 fiz1 = _mm256_add_pd(fiz1,tz);
643 fjx0 = _mm256_add_pd(fjx0,tx);
644 fjy0 = _mm256_add_pd(fjy0,ty);
645 fjz0 = _mm256_add_pd(fjz0,tz);
649 /**************************
650 * CALCULATE INTERACTIONS *
651 **************************/
653 if (gmx_mm256_any_lt(rsq20,rcutoff2))
656 r20 = _mm256_mul_pd(rsq20,rinv20);
657 r20 = _mm256_andnot_pd(dummy_mask,r20);
659 /* Compute parameters for interactions between i and j atoms */
660 qq20 = _mm256_mul_pd(iq2,jq0);
662 /* EWALD ELECTROSTATICS */
664 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
665 ewrt = _mm256_mul_pd(r20,ewtabscale);
666 ewitab = _mm256_cvttpd_epi32(ewrt);
667 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
668 ewitab = _mm_slli_epi32(ewitab,2);
669 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
670 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
671 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
672 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
673 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
674 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
675 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
676 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
677 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
679 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
681 /* Update potential sum for this i atom from the interaction with this j atom. */
682 velec = _mm256_and_pd(velec,cutoff_mask);
683 velec = _mm256_andnot_pd(dummy_mask,velec);
684 velecsum = _mm256_add_pd(velecsum,velec);
688 fscal = _mm256_and_pd(fscal,cutoff_mask);
690 fscal = _mm256_andnot_pd(dummy_mask,fscal);
692 /* Calculate temporary vectorial force */
693 tx = _mm256_mul_pd(fscal,dx20);
694 ty = _mm256_mul_pd(fscal,dy20);
695 tz = _mm256_mul_pd(fscal,dz20);
697 /* Update vectorial force */
698 fix2 = _mm256_add_pd(fix2,tx);
699 fiy2 = _mm256_add_pd(fiy2,ty);
700 fiz2 = _mm256_add_pd(fiz2,tz);
702 fjx0 = _mm256_add_pd(fjx0,tx);
703 fjy0 = _mm256_add_pd(fjy0,ty);
704 fjz0 = _mm256_add_pd(fjz0,tz);
708 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
709 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
710 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
711 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
713 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
715 /* Inner loop uses 162 flops */
718 /* End of innermost loop */
720 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
721 f+i_coord_offset,fshift+i_shift_offset);
724 /* Update potential energies */
725 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
726 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
728 /* Increment number of inner iterations */
729 inneriter += j_index_end - j_index_start;
731 /* Outer loop uses 20 flops */
734 /* Increment number of outer iterations */
737 /* Update outer/inner flops */
739 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*162);
742 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_double
743 * Electrostatics interaction: Ewald
744 * VdW interaction: LennardJones
745 * Geometry: Water3-Particle
746 * Calculate force/pot: Force
749 nb_kernel_ElecEwSh_VdwLJSh_GeomW3P1_F_avx_256_double
750 (t_nblist * gmx_restrict nlist,
751 rvec * gmx_restrict xx,
752 rvec * gmx_restrict ff,
753 t_forcerec * gmx_restrict fr,
754 t_mdatoms * gmx_restrict mdatoms,
755 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
756 t_nrnb * gmx_restrict nrnb)
758 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
759 * just 0 for non-waters.
760 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
761 * jnr indices corresponding to data put in the four positions in the SIMD register.
763 int i_shift_offset,i_coord_offset,outeriter,inneriter;
764 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
765 int jnrA,jnrB,jnrC,jnrD;
766 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
767 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
768 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
769 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
771 real *shiftvec,*fshift,*x,*f;
772 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
774 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
775 real * vdwioffsetptr0;
776 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
777 real * vdwioffsetptr1;
778 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
779 real * vdwioffsetptr2;
780 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
781 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
782 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
783 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
784 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
785 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
786 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
789 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
792 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
793 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
795 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
796 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
798 __m256d dummy_mask,cutoff_mask;
799 __m128 tmpmask0,tmpmask1;
800 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
801 __m256d one = _mm256_set1_pd(1.0);
802 __m256d two = _mm256_set1_pd(2.0);
808 jindex = nlist->jindex;
810 shiftidx = nlist->shift;
812 shiftvec = fr->shift_vec[0];
813 fshift = fr->fshift[0];
814 facel = _mm256_set1_pd(fr->epsfac);
815 charge = mdatoms->chargeA;
816 nvdwtype = fr->ntype;
818 vdwtype = mdatoms->typeA;
820 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
821 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
822 beta2 = _mm256_mul_pd(beta,beta);
823 beta3 = _mm256_mul_pd(beta,beta2);
825 ewtab = fr->ic->tabq_coul_F;
826 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
827 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
829 /* Setup water-specific parameters */
830 inr = nlist->iinr[0];
831 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
832 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
833 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
834 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
836 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
837 rcutoff_scalar = fr->rcoulomb;
838 rcutoff = _mm256_set1_pd(rcutoff_scalar);
839 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
841 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
842 rvdw = _mm256_set1_pd(fr->rvdw);
844 /* Avoid stupid compiler warnings */
845 jnrA = jnrB = jnrC = jnrD = 0;
854 for(iidx=0;iidx<4*DIM;iidx++)
859 /* Start outer loop over neighborlists */
860 for(iidx=0; iidx<nri; iidx++)
862 /* Load shift vector for this list */
863 i_shift_offset = DIM*shiftidx[iidx];
865 /* Load limits for loop over neighbors */
866 j_index_start = jindex[iidx];
867 j_index_end = jindex[iidx+1];
869 /* Get outer coordinate index */
871 i_coord_offset = DIM*inr;
873 /* Load i particle coords and add shift vector */
874 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
875 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
877 fix0 = _mm256_setzero_pd();
878 fiy0 = _mm256_setzero_pd();
879 fiz0 = _mm256_setzero_pd();
880 fix1 = _mm256_setzero_pd();
881 fiy1 = _mm256_setzero_pd();
882 fiz1 = _mm256_setzero_pd();
883 fix2 = _mm256_setzero_pd();
884 fiy2 = _mm256_setzero_pd();
885 fiz2 = _mm256_setzero_pd();
887 /* Start inner kernel loop */
888 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
891 /* Get j neighbor index, and coordinate index */
896 j_coord_offsetA = DIM*jnrA;
897 j_coord_offsetB = DIM*jnrB;
898 j_coord_offsetC = DIM*jnrC;
899 j_coord_offsetD = DIM*jnrD;
901 /* load j atom coordinates */
902 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
903 x+j_coord_offsetC,x+j_coord_offsetD,
906 /* Calculate displacement vector */
907 dx00 = _mm256_sub_pd(ix0,jx0);
908 dy00 = _mm256_sub_pd(iy0,jy0);
909 dz00 = _mm256_sub_pd(iz0,jz0);
910 dx10 = _mm256_sub_pd(ix1,jx0);
911 dy10 = _mm256_sub_pd(iy1,jy0);
912 dz10 = _mm256_sub_pd(iz1,jz0);
913 dx20 = _mm256_sub_pd(ix2,jx0);
914 dy20 = _mm256_sub_pd(iy2,jy0);
915 dz20 = _mm256_sub_pd(iz2,jz0);
917 /* Calculate squared distance and things based on it */
918 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
919 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
920 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
922 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
923 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
924 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
926 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
927 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
928 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
930 /* Load parameters for j particles */
931 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
932 charge+jnrC+0,charge+jnrD+0);
933 vdwjidx0A = 2*vdwtype[jnrA+0];
934 vdwjidx0B = 2*vdwtype[jnrB+0];
935 vdwjidx0C = 2*vdwtype[jnrC+0];
936 vdwjidx0D = 2*vdwtype[jnrD+0];
938 fjx0 = _mm256_setzero_pd();
939 fjy0 = _mm256_setzero_pd();
940 fjz0 = _mm256_setzero_pd();
942 /**************************
943 * CALCULATE INTERACTIONS *
944 **************************/
946 if (gmx_mm256_any_lt(rsq00,rcutoff2))
949 r00 = _mm256_mul_pd(rsq00,rinv00);
951 /* Compute parameters for interactions between i and j atoms */
952 qq00 = _mm256_mul_pd(iq0,jq0);
953 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
954 vdwioffsetptr0+vdwjidx0B,
955 vdwioffsetptr0+vdwjidx0C,
956 vdwioffsetptr0+vdwjidx0D,
959 /* EWALD ELECTROSTATICS */
961 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
962 ewrt = _mm256_mul_pd(r00,ewtabscale);
963 ewitab = _mm256_cvttpd_epi32(ewrt);
964 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
965 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
966 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
968 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
969 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
971 /* LENNARD-JONES DISPERSION/REPULSION */
973 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
974 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
976 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
978 fscal = _mm256_add_pd(felec,fvdw);
980 fscal = _mm256_and_pd(fscal,cutoff_mask);
982 /* Calculate temporary vectorial force */
983 tx = _mm256_mul_pd(fscal,dx00);
984 ty = _mm256_mul_pd(fscal,dy00);
985 tz = _mm256_mul_pd(fscal,dz00);
987 /* Update vectorial force */
988 fix0 = _mm256_add_pd(fix0,tx);
989 fiy0 = _mm256_add_pd(fiy0,ty);
990 fiz0 = _mm256_add_pd(fiz0,tz);
992 fjx0 = _mm256_add_pd(fjx0,tx);
993 fjy0 = _mm256_add_pd(fjy0,ty);
994 fjz0 = _mm256_add_pd(fjz0,tz);
998 /**************************
999 * CALCULATE INTERACTIONS *
1000 **************************/
1002 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1005 r10 = _mm256_mul_pd(rsq10,rinv10);
1007 /* Compute parameters for interactions between i and j atoms */
1008 qq10 = _mm256_mul_pd(iq1,jq0);
1010 /* EWALD ELECTROSTATICS */
1012 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1013 ewrt = _mm256_mul_pd(r10,ewtabscale);
1014 ewitab = _mm256_cvttpd_epi32(ewrt);
1015 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1016 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1017 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1019 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1020 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1022 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1026 fscal = _mm256_and_pd(fscal,cutoff_mask);
1028 /* Calculate temporary vectorial force */
1029 tx = _mm256_mul_pd(fscal,dx10);
1030 ty = _mm256_mul_pd(fscal,dy10);
1031 tz = _mm256_mul_pd(fscal,dz10);
1033 /* Update vectorial force */
1034 fix1 = _mm256_add_pd(fix1,tx);
1035 fiy1 = _mm256_add_pd(fiy1,ty);
1036 fiz1 = _mm256_add_pd(fiz1,tz);
1038 fjx0 = _mm256_add_pd(fjx0,tx);
1039 fjy0 = _mm256_add_pd(fjy0,ty);
1040 fjz0 = _mm256_add_pd(fjz0,tz);
1044 /**************************
1045 * CALCULATE INTERACTIONS *
1046 **************************/
1048 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1051 r20 = _mm256_mul_pd(rsq20,rinv20);
1053 /* Compute parameters for interactions between i and j atoms */
1054 qq20 = _mm256_mul_pd(iq2,jq0);
1056 /* EWALD ELECTROSTATICS */
1058 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1059 ewrt = _mm256_mul_pd(r20,ewtabscale);
1060 ewitab = _mm256_cvttpd_epi32(ewrt);
1061 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1062 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1063 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1065 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1066 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1068 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1072 fscal = _mm256_and_pd(fscal,cutoff_mask);
1074 /* Calculate temporary vectorial force */
1075 tx = _mm256_mul_pd(fscal,dx20);
1076 ty = _mm256_mul_pd(fscal,dy20);
1077 tz = _mm256_mul_pd(fscal,dz20);
1079 /* Update vectorial force */
1080 fix2 = _mm256_add_pd(fix2,tx);
1081 fiy2 = _mm256_add_pd(fiy2,ty);
1082 fiz2 = _mm256_add_pd(fiz2,tz);
1084 fjx0 = _mm256_add_pd(fjx0,tx);
1085 fjy0 = _mm256_add_pd(fjy0,ty);
1086 fjz0 = _mm256_add_pd(fjz0,tz);
1090 fjptrA = f+j_coord_offsetA;
1091 fjptrB = f+j_coord_offsetB;
1092 fjptrC = f+j_coord_offsetC;
1093 fjptrD = f+j_coord_offsetD;
1095 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1097 /* Inner loop uses 127 flops */
1100 if(jidx<j_index_end)
1103 /* Get j neighbor index, and coordinate index */
1104 jnrlistA = jjnr[jidx];
1105 jnrlistB = jjnr[jidx+1];
1106 jnrlistC = jjnr[jidx+2];
1107 jnrlistD = jjnr[jidx+3];
1108 /* Sign of each element will be negative for non-real atoms.
1109 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1110 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1112 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1114 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1115 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1116 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1118 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1119 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1120 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1121 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1122 j_coord_offsetA = DIM*jnrA;
1123 j_coord_offsetB = DIM*jnrB;
1124 j_coord_offsetC = DIM*jnrC;
1125 j_coord_offsetD = DIM*jnrD;
1127 /* load j atom coordinates */
1128 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1129 x+j_coord_offsetC,x+j_coord_offsetD,
1132 /* Calculate displacement vector */
1133 dx00 = _mm256_sub_pd(ix0,jx0);
1134 dy00 = _mm256_sub_pd(iy0,jy0);
1135 dz00 = _mm256_sub_pd(iz0,jz0);
1136 dx10 = _mm256_sub_pd(ix1,jx0);
1137 dy10 = _mm256_sub_pd(iy1,jy0);
1138 dz10 = _mm256_sub_pd(iz1,jz0);
1139 dx20 = _mm256_sub_pd(ix2,jx0);
1140 dy20 = _mm256_sub_pd(iy2,jy0);
1141 dz20 = _mm256_sub_pd(iz2,jz0);
1143 /* Calculate squared distance and things based on it */
1144 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1145 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1146 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1148 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1149 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1150 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1152 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1153 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1154 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1156 /* Load parameters for j particles */
1157 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1158 charge+jnrC+0,charge+jnrD+0);
1159 vdwjidx0A = 2*vdwtype[jnrA+0];
1160 vdwjidx0B = 2*vdwtype[jnrB+0];
1161 vdwjidx0C = 2*vdwtype[jnrC+0];
1162 vdwjidx0D = 2*vdwtype[jnrD+0];
1164 fjx0 = _mm256_setzero_pd();
1165 fjy0 = _mm256_setzero_pd();
1166 fjz0 = _mm256_setzero_pd();
1168 /**************************
1169 * CALCULATE INTERACTIONS *
1170 **************************/
1172 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1175 r00 = _mm256_mul_pd(rsq00,rinv00);
1176 r00 = _mm256_andnot_pd(dummy_mask,r00);
1178 /* Compute parameters for interactions between i and j atoms */
1179 qq00 = _mm256_mul_pd(iq0,jq0);
1180 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1181 vdwioffsetptr0+vdwjidx0B,
1182 vdwioffsetptr0+vdwjidx0C,
1183 vdwioffsetptr0+vdwjidx0D,
1186 /* EWALD ELECTROSTATICS */
1188 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1189 ewrt = _mm256_mul_pd(r00,ewtabscale);
1190 ewitab = _mm256_cvttpd_epi32(ewrt);
1191 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1192 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1193 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1195 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1196 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1198 /* LENNARD-JONES DISPERSION/REPULSION */
1200 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1201 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1203 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1205 fscal = _mm256_add_pd(felec,fvdw);
1207 fscal = _mm256_and_pd(fscal,cutoff_mask);
1209 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1211 /* Calculate temporary vectorial force */
1212 tx = _mm256_mul_pd(fscal,dx00);
1213 ty = _mm256_mul_pd(fscal,dy00);
1214 tz = _mm256_mul_pd(fscal,dz00);
1216 /* Update vectorial force */
1217 fix0 = _mm256_add_pd(fix0,tx);
1218 fiy0 = _mm256_add_pd(fiy0,ty);
1219 fiz0 = _mm256_add_pd(fiz0,tz);
1221 fjx0 = _mm256_add_pd(fjx0,tx);
1222 fjy0 = _mm256_add_pd(fjy0,ty);
1223 fjz0 = _mm256_add_pd(fjz0,tz);
1227 /**************************
1228 * CALCULATE INTERACTIONS *
1229 **************************/
1231 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1234 r10 = _mm256_mul_pd(rsq10,rinv10);
1235 r10 = _mm256_andnot_pd(dummy_mask,r10);
1237 /* Compute parameters for interactions between i and j atoms */
1238 qq10 = _mm256_mul_pd(iq1,jq0);
1240 /* EWALD ELECTROSTATICS */
1242 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1243 ewrt = _mm256_mul_pd(r10,ewtabscale);
1244 ewitab = _mm256_cvttpd_epi32(ewrt);
1245 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1246 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1247 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1249 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1250 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1252 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1256 fscal = _mm256_and_pd(fscal,cutoff_mask);
1258 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1260 /* Calculate temporary vectorial force */
1261 tx = _mm256_mul_pd(fscal,dx10);
1262 ty = _mm256_mul_pd(fscal,dy10);
1263 tz = _mm256_mul_pd(fscal,dz10);
1265 /* Update vectorial force */
1266 fix1 = _mm256_add_pd(fix1,tx);
1267 fiy1 = _mm256_add_pd(fiy1,ty);
1268 fiz1 = _mm256_add_pd(fiz1,tz);
1270 fjx0 = _mm256_add_pd(fjx0,tx);
1271 fjy0 = _mm256_add_pd(fjy0,ty);
1272 fjz0 = _mm256_add_pd(fjz0,tz);
1276 /**************************
1277 * CALCULATE INTERACTIONS *
1278 **************************/
1280 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1283 r20 = _mm256_mul_pd(rsq20,rinv20);
1284 r20 = _mm256_andnot_pd(dummy_mask,r20);
1286 /* Compute parameters for interactions between i and j atoms */
1287 qq20 = _mm256_mul_pd(iq2,jq0);
1289 /* EWALD ELECTROSTATICS */
1291 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1292 ewrt = _mm256_mul_pd(r20,ewtabscale);
1293 ewitab = _mm256_cvttpd_epi32(ewrt);
1294 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1295 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1296 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1298 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1299 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1301 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1305 fscal = _mm256_and_pd(fscal,cutoff_mask);
1307 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1309 /* Calculate temporary vectorial force */
1310 tx = _mm256_mul_pd(fscal,dx20);
1311 ty = _mm256_mul_pd(fscal,dy20);
1312 tz = _mm256_mul_pd(fscal,dz20);
1314 /* Update vectorial force */
1315 fix2 = _mm256_add_pd(fix2,tx);
1316 fiy2 = _mm256_add_pd(fiy2,ty);
1317 fiz2 = _mm256_add_pd(fiz2,tz);
1319 fjx0 = _mm256_add_pd(fjx0,tx);
1320 fjy0 = _mm256_add_pd(fjy0,ty);
1321 fjz0 = _mm256_add_pd(fjz0,tz);
1325 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1326 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1327 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1328 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1330 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1332 /* Inner loop uses 130 flops */
1335 /* End of innermost loop */
1337 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1338 f+i_coord_offset,fshift+i_shift_offset);
1340 /* Increment number of inner iterations */
1341 inneriter += j_index_end - j_index_start;
1343 /* Outer loop uses 18 flops */
1346 /* Increment number of outer iterations */
1349 /* Update outer/inner flops */
1351 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*130);