2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
42 #include "../nb_kernel.h"
43 #include "types/simple.h"
44 #include "gromacs/math/vec.h"
47 #include "gromacs/simd/math_x86_avx_256_double.h"
48 #include "kernelutil_x86_avx_256_double.h"
51 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_double
52 * Electrostatics interaction: Ewald
53 * VdW interaction: CubicSplineTable
54 * Geometry: Water3-Particle
55 * Calculate force/pot: PotentialAndForce
58 nb_kernel_ElecEw_VdwCSTab_GeomW3P1_VF_avx_256_double
59 (t_nblist * gmx_restrict nlist,
60 rvec * gmx_restrict xx,
61 rvec * gmx_restrict ff,
62 t_forcerec * gmx_restrict fr,
63 t_mdatoms * gmx_restrict mdatoms,
64 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
65 t_nrnb * gmx_restrict nrnb)
67 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
68 * just 0 for non-waters.
69 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
70 * jnr indices corresponding to data put in the four positions in the SIMD register.
72 int i_shift_offset,i_coord_offset,outeriter,inneriter;
73 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int jnrA,jnrB,jnrC,jnrD;
75 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
76 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
77 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
78 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
80 real *shiftvec,*fshift,*x,*f;
81 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
83 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
84 real * vdwioffsetptr0;
85 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
86 real * vdwioffsetptr1;
87 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
88 real * vdwioffsetptr2;
89 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
90 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
91 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
92 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
93 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
94 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
95 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
98 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
101 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
102 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
104 __m128i ifour = _mm_set1_epi32(4);
105 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
108 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
109 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
111 __m256d dummy_mask,cutoff_mask;
112 __m128 tmpmask0,tmpmask1;
113 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
114 __m256d one = _mm256_set1_pd(1.0);
115 __m256d two = _mm256_set1_pd(2.0);
121 jindex = nlist->jindex;
123 shiftidx = nlist->shift;
125 shiftvec = fr->shift_vec[0];
126 fshift = fr->fshift[0];
127 facel = _mm256_set1_pd(fr->epsfac);
128 charge = mdatoms->chargeA;
129 nvdwtype = fr->ntype;
131 vdwtype = mdatoms->typeA;
133 vftab = kernel_data->table_vdw->data;
134 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
136 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
137 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
138 beta2 = _mm256_mul_pd(beta,beta);
139 beta3 = _mm256_mul_pd(beta,beta2);
141 ewtab = fr->ic->tabq_coul_FDV0;
142 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
143 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
145 /* Setup water-specific parameters */
146 inr = nlist->iinr[0];
147 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
148 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
149 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
150 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
152 /* Avoid stupid compiler warnings */
153 jnrA = jnrB = jnrC = jnrD = 0;
162 for(iidx=0;iidx<4*DIM;iidx++)
167 /* Start outer loop over neighborlists */
168 for(iidx=0; iidx<nri; iidx++)
170 /* Load shift vector for this list */
171 i_shift_offset = DIM*shiftidx[iidx];
173 /* Load limits for loop over neighbors */
174 j_index_start = jindex[iidx];
175 j_index_end = jindex[iidx+1];
177 /* Get outer coordinate index */
179 i_coord_offset = DIM*inr;
181 /* Load i particle coords and add shift vector */
182 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
183 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
185 fix0 = _mm256_setzero_pd();
186 fiy0 = _mm256_setzero_pd();
187 fiz0 = _mm256_setzero_pd();
188 fix1 = _mm256_setzero_pd();
189 fiy1 = _mm256_setzero_pd();
190 fiz1 = _mm256_setzero_pd();
191 fix2 = _mm256_setzero_pd();
192 fiy2 = _mm256_setzero_pd();
193 fiz2 = _mm256_setzero_pd();
195 /* Reset potential sums */
196 velecsum = _mm256_setzero_pd();
197 vvdwsum = _mm256_setzero_pd();
199 /* Start inner kernel loop */
200 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
203 /* Get j neighbor index, and coordinate index */
208 j_coord_offsetA = DIM*jnrA;
209 j_coord_offsetB = DIM*jnrB;
210 j_coord_offsetC = DIM*jnrC;
211 j_coord_offsetD = DIM*jnrD;
213 /* load j atom coordinates */
214 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
215 x+j_coord_offsetC,x+j_coord_offsetD,
218 /* Calculate displacement vector */
219 dx00 = _mm256_sub_pd(ix0,jx0);
220 dy00 = _mm256_sub_pd(iy0,jy0);
221 dz00 = _mm256_sub_pd(iz0,jz0);
222 dx10 = _mm256_sub_pd(ix1,jx0);
223 dy10 = _mm256_sub_pd(iy1,jy0);
224 dz10 = _mm256_sub_pd(iz1,jz0);
225 dx20 = _mm256_sub_pd(ix2,jx0);
226 dy20 = _mm256_sub_pd(iy2,jy0);
227 dz20 = _mm256_sub_pd(iz2,jz0);
229 /* Calculate squared distance and things based on it */
230 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
231 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
232 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
234 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
235 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
236 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
238 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
239 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
240 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
242 /* Load parameters for j particles */
243 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
244 charge+jnrC+0,charge+jnrD+0);
245 vdwjidx0A = 2*vdwtype[jnrA+0];
246 vdwjidx0B = 2*vdwtype[jnrB+0];
247 vdwjidx0C = 2*vdwtype[jnrC+0];
248 vdwjidx0D = 2*vdwtype[jnrD+0];
250 fjx0 = _mm256_setzero_pd();
251 fjy0 = _mm256_setzero_pd();
252 fjz0 = _mm256_setzero_pd();
254 /**************************
255 * CALCULATE INTERACTIONS *
256 **************************/
258 r00 = _mm256_mul_pd(rsq00,rinv00);
260 /* Compute parameters for interactions between i and j atoms */
261 qq00 = _mm256_mul_pd(iq0,jq0);
262 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
263 vdwioffsetptr0+vdwjidx0B,
264 vdwioffsetptr0+vdwjidx0C,
265 vdwioffsetptr0+vdwjidx0D,
268 /* Calculate table index by multiplying r with table scale and truncate to integer */
269 rt = _mm256_mul_pd(r00,vftabscale);
270 vfitab = _mm256_cvttpd_epi32(rt);
271 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
272 vfitab = _mm_slli_epi32(vfitab,3);
274 /* EWALD ELECTROSTATICS */
276 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
277 ewrt = _mm256_mul_pd(r00,ewtabscale);
278 ewitab = _mm256_cvttpd_epi32(ewrt);
279 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
280 ewitab = _mm_slli_epi32(ewitab,2);
281 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
282 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
283 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
284 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
285 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
286 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
287 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
288 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
289 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
291 /* CUBIC SPLINE TABLE DISPERSION */
292 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
293 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
294 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
295 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
296 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
297 Heps = _mm256_mul_pd(vfeps,H);
298 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
299 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
300 vvdw6 = _mm256_mul_pd(c6_00,VV);
301 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
302 fvdw6 = _mm256_mul_pd(c6_00,FF);
304 /* CUBIC SPLINE TABLE REPULSION */
305 vfitab = _mm_add_epi32(vfitab,ifour);
306 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
307 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
308 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
309 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
310 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
311 Heps = _mm256_mul_pd(vfeps,H);
312 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
313 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
314 vvdw12 = _mm256_mul_pd(c12_00,VV);
315 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
316 fvdw12 = _mm256_mul_pd(c12_00,FF);
317 vvdw = _mm256_add_pd(vvdw12,vvdw6);
318 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
320 /* Update potential sum for this i atom from the interaction with this j atom. */
321 velecsum = _mm256_add_pd(velecsum,velec);
322 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
324 fscal = _mm256_add_pd(felec,fvdw);
326 /* Calculate temporary vectorial force */
327 tx = _mm256_mul_pd(fscal,dx00);
328 ty = _mm256_mul_pd(fscal,dy00);
329 tz = _mm256_mul_pd(fscal,dz00);
331 /* Update vectorial force */
332 fix0 = _mm256_add_pd(fix0,tx);
333 fiy0 = _mm256_add_pd(fiy0,ty);
334 fiz0 = _mm256_add_pd(fiz0,tz);
336 fjx0 = _mm256_add_pd(fjx0,tx);
337 fjy0 = _mm256_add_pd(fjy0,ty);
338 fjz0 = _mm256_add_pd(fjz0,tz);
340 /**************************
341 * CALCULATE INTERACTIONS *
342 **************************/
344 r10 = _mm256_mul_pd(rsq10,rinv10);
346 /* Compute parameters for interactions between i and j atoms */
347 qq10 = _mm256_mul_pd(iq1,jq0);
349 /* EWALD ELECTROSTATICS */
351 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
352 ewrt = _mm256_mul_pd(r10,ewtabscale);
353 ewitab = _mm256_cvttpd_epi32(ewrt);
354 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
355 ewitab = _mm_slli_epi32(ewitab,2);
356 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
357 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
358 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
359 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
360 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
361 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
362 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
363 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
364 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
366 /* Update potential sum for this i atom from the interaction with this j atom. */
367 velecsum = _mm256_add_pd(velecsum,velec);
371 /* Calculate temporary vectorial force */
372 tx = _mm256_mul_pd(fscal,dx10);
373 ty = _mm256_mul_pd(fscal,dy10);
374 tz = _mm256_mul_pd(fscal,dz10);
376 /* Update vectorial force */
377 fix1 = _mm256_add_pd(fix1,tx);
378 fiy1 = _mm256_add_pd(fiy1,ty);
379 fiz1 = _mm256_add_pd(fiz1,tz);
381 fjx0 = _mm256_add_pd(fjx0,tx);
382 fjy0 = _mm256_add_pd(fjy0,ty);
383 fjz0 = _mm256_add_pd(fjz0,tz);
385 /**************************
386 * CALCULATE INTERACTIONS *
387 **************************/
389 r20 = _mm256_mul_pd(rsq20,rinv20);
391 /* Compute parameters for interactions between i and j atoms */
392 qq20 = _mm256_mul_pd(iq2,jq0);
394 /* EWALD ELECTROSTATICS */
396 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
397 ewrt = _mm256_mul_pd(r20,ewtabscale);
398 ewitab = _mm256_cvttpd_epi32(ewrt);
399 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
400 ewitab = _mm_slli_epi32(ewitab,2);
401 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
402 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
403 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
404 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
405 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
406 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
407 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
408 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
409 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
411 /* Update potential sum for this i atom from the interaction with this j atom. */
412 velecsum = _mm256_add_pd(velecsum,velec);
416 /* Calculate temporary vectorial force */
417 tx = _mm256_mul_pd(fscal,dx20);
418 ty = _mm256_mul_pd(fscal,dy20);
419 tz = _mm256_mul_pd(fscal,dz20);
421 /* Update vectorial force */
422 fix2 = _mm256_add_pd(fix2,tx);
423 fiy2 = _mm256_add_pd(fiy2,ty);
424 fiz2 = _mm256_add_pd(fiz2,tz);
426 fjx0 = _mm256_add_pd(fjx0,tx);
427 fjy0 = _mm256_add_pd(fjy0,ty);
428 fjz0 = _mm256_add_pd(fjz0,tz);
430 fjptrA = f+j_coord_offsetA;
431 fjptrB = f+j_coord_offsetB;
432 fjptrC = f+j_coord_offsetC;
433 fjptrD = f+j_coord_offsetD;
435 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
437 /* Inner loop uses 160 flops */
443 /* Get j neighbor index, and coordinate index */
444 jnrlistA = jjnr[jidx];
445 jnrlistB = jjnr[jidx+1];
446 jnrlistC = jjnr[jidx+2];
447 jnrlistD = jjnr[jidx+3];
448 /* Sign of each element will be negative for non-real atoms.
449 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
450 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
452 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
454 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
455 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
456 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
458 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
459 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
460 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
461 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
462 j_coord_offsetA = DIM*jnrA;
463 j_coord_offsetB = DIM*jnrB;
464 j_coord_offsetC = DIM*jnrC;
465 j_coord_offsetD = DIM*jnrD;
467 /* load j atom coordinates */
468 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
469 x+j_coord_offsetC,x+j_coord_offsetD,
472 /* Calculate displacement vector */
473 dx00 = _mm256_sub_pd(ix0,jx0);
474 dy00 = _mm256_sub_pd(iy0,jy0);
475 dz00 = _mm256_sub_pd(iz0,jz0);
476 dx10 = _mm256_sub_pd(ix1,jx0);
477 dy10 = _mm256_sub_pd(iy1,jy0);
478 dz10 = _mm256_sub_pd(iz1,jz0);
479 dx20 = _mm256_sub_pd(ix2,jx0);
480 dy20 = _mm256_sub_pd(iy2,jy0);
481 dz20 = _mm256_sub_pd(iz2,jz0);
483 /* Calculate squared distance and things based on it */
484 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
485 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
486 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
488 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
489 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
490 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
492 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
493 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
494 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
496 /* Load parameters for j particles */
497 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
498 charge+jnrC+0,charge+jnrD+0);
499 vdwjidx0A = 2*vdwtype[jnrA+0];
500 vdwjidx0B = 2*vdwtype[jnrB+0];
501 vdwjidx0C = 2*vdwtype[jnrC+0];
502 vdwjidx0D = 2*vdwtype[jnrD+0];
504 fjx0 = _mm256_setzero_pd();
505 fjy0 = _mm256_setzero_pd();
506 fjz0 = _mm256_setzero_pd();
508 /**************************
509 * CALCULATE INTERACTIONS *
510 **************************/
512 r00 = _mm256_mul_pd(rsq00,rinv00);
513 r00 = _mm256_andnot_pd(dummy_mask,r00);
515 /* Compute parameters for interactions between i and j atoms */
516 qq00 = _mm256_mul_pd(iq0,jq0);
517 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
518 vdwioffsetptr0+vdwjidx0B,
519 vdwioffsetptr0+vdwjidx0C,
520 vdwioffsetptr0+vdwjidx0D,
523 /* Calculate table index by multiplying r with table scale and truncate to integer */
524 rt = _mm256_mul_pd(r00,vftabscale);
525 vfitab = _mm256_cvttpd_epi32(rt);
526 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
527 vfitab = _mm_slli_epi32(vfitab,3);
529 /* EWALD ELECTROSTATICS */
531 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
532 ewrt = _mm256_mul_pd(r00,ewtabscale);
533 ewitab = _mm256_cvttpd_epi32(ewrt);
534 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
535 ewitab = _mm_slli_epi32(ewitab,2);
536 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
537 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
538 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
539 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
540 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
541 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
542 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
543 velec = _mm256_mul_pd(qq00,_mm256_sub_pd(rinv00,velec));
544 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
546 /* CUBIC SPLINE TABLE DISPERSION */
547 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
548 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
549 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
550 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
551 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
552 Heps = _mm256_mul_pd(vfeps,H);
553 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
554 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
555 vvdw6 = _mm256_mul_pd(c6_00,VV);
556 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
557 fvdw6 = _mm256_mul_pd(c6_00,FF);
559 /* CUBIC SPLINE TABLE REPULSION */
560 vfitab = _mm_add_epi32(vfitab,ifour);
561 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
562 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
563 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
564 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
565 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
566 Heps = _mm256_mul_pd(vfeps,H);
567 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
568 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
569 vvdw12 = _mm256_mul_pd(c12_00,VV);
570 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
571 fvdw12 = _mm256_mul_pd(c12_00,FF);
572 vvdw = _mm256_add_pd(vvdw12,vvdw6);
573 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
575 /* Update potential sum for this i atom from the interaction with this j atom. */
576 velec = _mm256_andnot_pd(dummy_mask,velec);
577 velecsum = _mm256_add_pd(velecsum,velec);
578 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
579 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
581 fscal = _mm256_add_pd(felec,fvdw);
583 fscal = _mm256_andnot_pd(dummy_mask,fscal);
585 /* Calculate temporary vectorial force */
586 tx = _mm256_mul_pd(fscal,dx00);
587 ty = _mm256_mul_pd(fscal,dy00);
588 tz = _mm256_mul_pd(fscal,dz00);
590 /* Update vectorial force */
591 fix0 = _mm256_add_pd(fix0,tx);
592 fiy0 = _mm256_add_pd(fiy0,ty);
593 fiz0 = _mm256_add_pd(fiz0,tz);
595 fjx0 = _mm256_add_pd(fjx0,tx);
596 fjy0 = _mm256_add_pd(fjy0,ty);
597 fjz0 = _mm256_add_pd(fjz0,tz);
599 /**************************
600 * CALCULATE INTERACTIONS *
601 **************************/
603 r10 = _mm256_mul_pd(rsq10,rinv10);
604 r10 = _mm256_andnot_pd(dummy_mask,r10);
606 /* Compute parameters for interactions between i and j atoms */
607 qq10 = _mm256_mul_pd(iq1,jq0);
609 /* EWALD ELECTROSTATICS */
611 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
612 ewrt = _mm256_mul_pd(r10,ewtabscale);
613 ewitab = _mm256_cvttpd_epi32(ewrt);
614 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
615 ewitab = _mm_slli_epi32(ewitab,2);
616 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
617 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
618 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
619 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
620 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
621 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
622 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
623 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(rinv10,velec));
624 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
626 /* Update potential sum for this i atom from the interaction with this j atom. */
627 velec = _mm256_andnot_pd(dummy_mask,velec);
628 velecsum = _mm256_add_pd(velecsum,velec);
632 fscal = _mm256_andnot_pd(dummy_mask,fscal);
634 /* Calculate temporary vectorial force */
635 tx = _mm256_mul_pd(fscal,dx10);
636 ty = _mm256_mul_pd(fscal,dy10);
637 tz = _mm256_mul_pd(fscal,dz10);
639 /* Update vectorial force */
640 fix1 = _mm256_add_pd(fix1,tx);
641 fiy1 = _mm256_add_pd(fiy1,ty);
642 fiz1 = _mm256_add_pd(fiz1,tz);
644 fjx0 = _mm256_add_pd(fjx0,tx);
645 fjy0 = _mm256_add_pd(fjy0,ty);
646 fjz0 = _mm256_add_pd(fjz0,tz);
648 /**************************
649 * CALCULATE INTERACTIONS *
650 **************************/
652 r20 = _mm256_mul_pd(rsq20,rinv20);
653 r20 = _mm256_andnot_pd(dummy_mask,r20);
655 /* Compute parameters for interactions between i and j atoms */
656 qq20 = _mm256_mul_pd(iq2,jq0);
658 /* EWALD ELECTROSTATICS */
660 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
661 ewrt = _mm256_mul_pd(r20,ewtabscale);
662 ewitab = _mm256_cvttpd_epi32(ewrt);
663 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
664 ewitab = _mm_slli_epi32(ewitab,2);
665 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
666 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
667 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
668 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
669 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
670 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
671 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
672 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(rinv20,velec));
673 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
675 /* Update potential sum for this i atom from the interaction with this j atom. */
676 velec = _mm256_andnot_pd(dummy_mask,velec);
677 velecsum = _mm256_add_pd(velecsum,velec);
681 fscal = _mm256_andnot_pd(dummy_mask,fscal);
683 /* Calculate temporary vectorial force */
684 tx = _mm256_mul_pd(fscal,dx20);
685 ty = _mm256_mul_pd(fscal,dy20);
686 tz = _mm256_mul_pd(fscal,dz20);
688 /* Update vectorial force */
689 fix2 = _mm256_add_pd(fix2,tx);
690 fiy2 = _mm256_add_pd(fiy2,ty);
691 fiz2 = _mm256_add_pd(fiz2,tz);
693 fjx0 = _mm256_add_pd(fjx0,tx);
694 fjy0 = _mm256_add_pd(fjy0,ty);
695 fjz0 = _mm256_add_pd(fjz0,tz);
697 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
698 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
699 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
700 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
702 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
704 /* Inner loop uses 163 flops */
707 /* End of innermost loop */
709 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
710 f+i_coord_offset,fshift+i_shift_offset);
713 /* Update potential energies */
714 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
715 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
717 /* Increment number of inner iterations */
718 inneriter += j_index_end - j_index_start;
720 /* Outer loop uses 20 flops */
723 /* Increment number of outer iterations */
726 /* Update outer/inner flops */
728 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*163);
731 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_double
732 * Electrostatics interaction: Ewald
733 * VdW interaction: CubicSplineTable
734 * Geometry: Water3-Particle
735 * Calculate force/pot: Force
738 nb_kernel_ElecEw_VdwCSTab_GeomW3P1_F_avx_256_double
739 (t_nblist * gmx_restrict nlist,
740 rvec * gmx_restrict xx,
741 rvec * gmx_restrict ff,
742 t_forcerec * gmx_restrict fr,
743 t_mdatoms * gmx_restrict mdatoms,
744 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
745 t_nrnb * gmx_restrict nrnb)
747 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
748 * just 0 for non-waters.
749 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
750 * jnr indices corresponding to data put in the four positions in the SIMD register.
752 int i_shift_offset,i_coord_offset,outeriter,inneriter;
753 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
754 int jnrA,jnrB,jnrC,jnrD;
755 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
756 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
757 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
758 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
760 real *shiftvec,*fshift,*x,*f;
761 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
763 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
764 real * vdwioffsetptr0;
765 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
766 real * vdwioffsetptr1;
767 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
768 real * vdwioffsetptr2;
769 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
770 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
771 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
772 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
773 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
774 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
775 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
778 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
781 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
782 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
784 __m128i ifour = _mm_set1_epi32(4);
785 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
788 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
789 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
791 __m256d dummy_mask,cutoff_mask;
792 __m128 tmpmask0,tmpmask1;
793 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
794 __m256d one = _mm256_set1_pd(1.0);
795 __m256d two = _mm256_set1_pd(2.0);
801 jindex = nlist->jindex;
803 shiftidx = nlist->shift;
805 shiftvec = fr->shift_vec[0];
806 fshift = fr->fshift[0];
807 facel = _mm256_set1_pd(fr->epsfac);
808 charge = mdatoms->chargeA;
809 nvdwtype = fr->ntype;
811 vdwtype = mdatoms->typeA;
813 vftab = kernel_data->table_vdw->data;
814 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
816 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
817 beta = _mm256_set1_pd(fr->ic->ewaldcoeff_q);
818 beta2 = _mm256_mul_pd(beta,beta);
819 beta3 = _mm256_mul_pd(beta,beta2);
821 ewtab = fr->ic->tabq_coul_F;
822 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
823 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
825 /* Setup water-specific parameters */
826 inr = nlist->iinr[0];
827 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
828 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
829 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
830 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
832 /* Avoid stupid compiler warnings */
833 jnrA = jnrB = jnrC = jnrD = 0;
842 for(iidx=0;iidx<4*DIM;iidx++)
847 /* Start outer loop over neighborlists */
848 for(iidx=0; iidx<nri; iidx++)
850 /* Load shift vector for this list */
851 i_shift_offset = DIM*shiftidx[iidx];
853 /* Load limits for loop over neighbors */
854 j_index_start = jindex[iidx];
855 j_index_end = jindex[iidx+1];
857 /* Get outer coordinate index */
859 i_coord_offset = DIM*inr;
861 /* Load i particle coords and add shift vector */
862 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
863 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
865 fix0 = _mm256_setzero_pd();
866 fiy0 = _mm256_setzero_pd();
867 fiz0 = _mm256_setzero_pd();
868 fix1 = _mm256_setzero_pd();
869 fiy1 = _mm256_setzero_pd();
870 fiz1 = _mm256_setzero_pd();
871 fix2 = _mm256_setzero_pd();
872 fiy2 = _mm256_setzero_pd();
873 fiz2 = _mm256_setzero_pd();
875 /* Start inner kernel loop */
876 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
879 /* Get j neighbor index, and coordinate index */
884 j_coord_offsetA = DIM*jnrA;
885 j_coord_offsetB = DIM*jnrB;
886 j_coord_offsetC = DIM*jnrC;
887 j_coord_offsetD = DIM*jnrD;
889 /* load j atom coordinates */
890 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
891 x+j_coord_offsetC,x+j_coord_offsetD,
894 /* Calculate displacement vector */
895 dx00 = _mm256_sub_pd(ix0,jx0);
896 dy00 = _mm256_sub_pd(iy0,jy0);
897 dz00 = _mm256_sub_pd(iz0,jz0);
898 dx10 = _mm256_sub_pd(ix1,jx0);
899 dy10 = _mm256_sub_pd(iy1,jy0);
900 dz10 = _mm256_sub_pd(iz1,jz0);
901 dx20 = _mm256_sub_pd(ix2,jx0);
902 dy20 = _mm256_sub_pd(iy2,jy0);
903 dz20 = _mm256_sub_pd(iz2,jz0);
905 /* Calculate squared distance and things based on it */
906 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
907 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
908 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
910 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
911 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
912 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
914 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
915 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
916 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
918 /* Load parameters for j particles */
919 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
920 charge+jnrC+0,charge+jnrD+0);
921 vdwjidx0A = 2*vdwtype[jnrA+0];
922 vdwjidx0B = 2*vdwtype[jnrB+0];
923 vdwjidx0C = 2*vdwtype[jnrC+0];
924 vdwjidx0D = 2*vdwtype[jnrD+0];
926 fjx0 = _mm256_setzero_pd();
927 fjy0 = _mm256_setzero_pd();
928 fjz0 = _mm256_setzero_pd();
930 /**************************
931 * CALCULATE INTERACTIONS *
932 **************************/
934 r00 = _mm256_mul_pd(rsq00,rinv00);
936 /* Compute parameters for interactions between i and j atoms */
937 qq00 = _mm256_mul_pd(iq0,jq0);
938 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
939 vdwioffsetptr0+vdwjidx0B,
940 vdwioffsetptr0+vdwjidx0C,
941 vdwioffsetptr0+vdwjidx0D,
944 /* Calculate table index by multiplying r with table scale and truncate to integer */
945 rt = _mm256_mul_pd(r00,vftabscale);
946 vfitab = _mm256_cvttpd_epi32(rt);
947 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
948 vfitab = _mm_slli_epi32(vfitab,3);
950 /* EWALD ELECTROSTATICS */
952 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
953 ewrt = _mm256_mul_pd(r00,ewtabscale);
954 ewitab = _mm256_cvttpd_epi32(ewrt);
955 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
956 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
957 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
959 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
960 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
962 /* CUBIC SPLINE TABLE DISPERSION */
963 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
964 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
965 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
966 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
967 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
968 Heps = _mm256_mul_pd(vfeps,H);
969 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
970 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
971 fvdw6 = _mm256_mul_pd(c6_00,FF);
973 /* CUBIC SPLINE TABLE REPULSION */
974 vfitab = _mm_add_epi32(vfitab,ifour);
975 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
976 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
977 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
978 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
979 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
980 Heps = _mm256_mul_pd(vfeps,H);
981 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
982 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
983 fvdw12 = _mm256_mul_pd(c12_00,FF);
984 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
986 fscal = _mm256_add_pd(felec,fvdw);
988 /* Calculate temporary vectorial force */
989 tx = _mm256_mul_pd(fscal,dx00);
990 ty = _mm256_mul_pd(fscal,dy00);
991 tz = _mm256_mul_pd(fscal,dz00);
993 /* Update vectorial force */
994 fix0 = _mm256_add_pd(fix0,tx);
995 fiy0 = _mm256_add_pd(fiy0,ty);
996 fiz0 = _mm256_add_pd(fiz0,tz);
998 fjx0 = _mm256_add_pd(fjx0,tx);
999 fjy0 = _mm256_add_pd(fjy0,ty);
1000 fjz0 = _mm256_add_pd(fjz0,tz);
1002 /**************************
1003 * CALCULATE INTERACTIONS *
1004 **************************/
1006 r10 = _mm256_mul_pd(rsq10,rinv10);
1008 /* Compute parameters for interactions between i and j atoms */
1009 qq10 = _mm256_mul_pd(iq1,jq0);
1011 /* EWALD ELECTROSTATICS */
1013 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1014 ewrt = _mm256_mul_pd(r10,ewtabscale);
1015 ewitab = _mm256_cvttpd_epi32(ewrt);
1016 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1017 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1018 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1020 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1021 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1025 /* Calculate temporary vectorial force */
1026 tx = _mm256_mul_pd(fscal,dx10);
1027 ty = _mm256_mul_pd(fscal,dy10);
1028 tz = _mm256_mul_pd(fscal,dz10);
1030 /* Update vectorial force */
1031 fix1 = _mm256_add_pd(fix1,tx);
1032 fiy1 = _mm256_add_pd(fiy1,ty);
1033 fiz1 = _mm256_add_pd(fiz1,tz);
1035 fjx0 = _mm256_add_pd(fjx0,tx);
1036 fjy0 = _mm256_add_pd(fjy0,ty);
1037 fjz0 = _mm256_add_pd(fjz0,tz);
1039 /**************************
1040 * CALCULATE INTERACTIONS *
1041 **************************/
1043 r20 = _mm256_mul_pd(rsq20,rinv20);
1045 /* Compute parameters for interactions between i and j atoms */
1046 qq20 = _mm256_mul_pd(iq2,jq0);
1048 /* EWALD ELECTROSTATICS */
1050 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1051 ewrt = _mm256_mul_pd(r20,ewtabscale);
1052 ewitab = _mm256_cvttpd_epi32(ewrt);
1053 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1054 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1055 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1057 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1058 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1062 /* Calculate temporary vectorial force */
1063 tx = _mm256_mul_pd(fscal,dx20);
1064 ty = _mm256_mul_pd(fscal,dy20);
1065 tz = _mm256_mul_pd(fscal,dz20);
1067 /* Update vectorial force */
1068 fix2 = _mm256_add_pd(fix2,tx);
1069 fiy2 = _mm256_add_pd(fiy2,ty);
1070 fiz2 = _mm256_add_pd(fiz2,tz);
1072 fjx0 = _mm256_add_pd(fjx0,tx);
1073 fjy0 = _mm256_add_pd(fjy0,ty);
1074 fjz0 = _mm256_add_pd(fjz0,tz);
1076 fjptrA = f+j_coord_offsetA;
1077 fjptrB = f+j_coord_offsetB;
1078 fjptrC = f+j_coord_offsetC;
1079 fjptrD = f+j_coord_offsetD;
1081 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1083 /* Inner loop uses 137 flops */
1086 if(jidx<j_index_end)
1089 /* Get j neighbor index, and coordinate index */
1090 jnrlistA = jjnr[jidx];
1091 jnrlistB = jjnr[jidx+1];
1092 jnrlistC = jjnr[jidx+2];
1093 jnrlistD = jjnr[jidx+3];
1094 /* Sign of each element will be negative for non-real atoms.
1095 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1096 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1098 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1100 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1101 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1102 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1104 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1105 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1106 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1107 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1108 j_coord_offsetA = DIM*jnrA;
1109 j_coord_offsetB = DIM*jnrB;
1110 j_coord_offsetC = DIM*jnrC;
1111 j_coord_offsetD = DIM*jnrD;
1113 /* load j atom coordinates */
1114 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1115 x+j_coord_offsetC,x+j_coord_offsetD,
1118 /* Calculate displacement vector */
1119 dx00 = _mm256_sub_pd(ix0,jx0);
1120 dy00 = _mm256_sub_pd(iy0,jy0);
1121 dz00 = _mm256_sub_pd(iz0,jz0);
1122 dx10 = _mm256_sub_pd(ix1,jx0);
1123 dy10 = _mm256_sub_pd(iy1,jy0);
1124 dz10 = _mm256_sub_pd(iz1,jz0);
1125 dx20 = _mm256_sub_pd(ix2,jx0);
1126 dy20 = _mm256_sub_pd(iy2,jy0);
1127 dz20 = _mm256_sub_pd(iz2,jz0);
1129 /* Calculate squared distance and things based on it */
1130 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1131 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1132 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1134 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1135 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1136 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1138 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1139 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1140 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1142 /* Load parameters for j particles */
1143 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1144 charge+jnrC+0,charge+jnrD+0);
1145 vdwjidx0A = 2*vdwtype[jnrA+0];
1146 vdwjidx0B = 2*vdwtype[jnrB+0];
1147 vdwjidx0C = 2*vdwtype[jnrC+0];
1148 vdwjidx0D = 2*vdwtype[jnrD+0];
1150 fjx0 = _mm256_setzero_pd();
1151 fjy0 = _mm256_setzero_pd();
1152 fjz0 = _mm256_setzero_pd();
1154 /**************************
1155 * CALCULATE INTERACTIONS *
1156 **************************/
1158 r00 = _mm256_mul_pd(rsq00,rinv00);
1159 r00 = _mm256_andnot_pd(dummy_mask,r00);
1161 /* Compute parameters for interactions between i and j atoms */
1162 qq00 = _mm256_mul_pd(iq0,jq0);
1163 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1164 vdwioffsetptr0+vdwjidx0B,
1165 vdwioffsetptr0+vdwjidx0C,
1166 vdwioffsetptr0+vdwjidx0D,
1169 /* Calculate table index by multiplying r with table scale and truncate to integer */
1170 rt = _mm256_mul_pd(r00,vftabscale);
1171 vfitab = _mm256_cvttpd_epi32(rt);
1172 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1173 vfitab = _mm_slli_epi32(vfitab,3);
1175 /* EWALD ELECTROSTATICS */
1177 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1178 ewrt = _mm256_mul_pd(r00,ewtabscale);
1179 ewitab = _mm256_cvttpd_epi32(ewrt);
1180 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1181 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1182 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1184 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1185 felec = _mm256_mul_pd(_mm256_mul_pd(qq00,rinv00),_mm256_sub_pd(rinvsq00,felec));
1187 /* CUBIC SPLINE TABLE DISPERSION */
1188 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1189 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1190 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1191 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1192 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1193 Heps = _mm256_mul_pd(vfeps,H);
1194 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1195 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1196 fvdw6 = _mm256_mul_pd(c6_00,FF);
1198 /* CUBIC SPLINE TABLE REPULSION */
1199 vfitab = _mm_add_epi32(vfitab,ifour);
1200 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1201 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1202 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1203 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1204 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1205 Heps = _mm256_mul_pd(vfeps,H);
1206 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1207 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1208 fvdw12 = _mm256_mul_pd(c12_00,FF);
1209 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1211 fscal = _mm256_add_pd(felec,fvdw);
1213 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1215 /* Calculate temporary vectorial force */
1216 tx = _mm256_mul_pd(fscal,dx00);
1217 ty = _mm256_mul_pd(fscal,dy00);
1218 tz = _mm256_mul_pd(fscal,dz00);
1220 /* Update vectorial force */
1221 fix0 = _mm256_add_pd(fix0,tx);
1222 fiy0 = _mm256_add_pd(fiy0,ty);
1223 fiz0 = _mm256_add_pd(fiz0,tz);
1225 fjx0 = _mm256_add_pd(fjx0,tx);
1226 fjy0 = _mm256_add_pd(fjy0,ty);
1227 fjz0 = _mm256_add_pd(fjz0,tz);
1229 /**************************
1230 * CALCULATE INTERACTIONS *
1231 **************************/
1233 r10 = _mm256_mul_pd(rsq10,rinv10);
1234 r10 = _mm256_andnot_pd(dummy_mask,r10);
1236 /* Compute parameters for interactions between i and j atoms */
1237 qq10 = _mm256_mul_pd(iq1,jq0);
1239 /* EWALD ELECTROSTATICS */
1241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1242 ewrt = _mm256_mul_pd(r10,ewtabscale);
1243 ewitab = _mm256_cvttpd_epi32(ewrt);
1244 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1245 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1246 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1248 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1249 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1253 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1255 /* Calculate temporary vectorial force */
1256 tx = _mm256_mul_pd(fscal,dx10);
1257 ty = _mm256_mul_pd(fscal,dy10);
1258 tz = _mm256_mul_pd(fscal,dz10);
1260 /* Update vectorial force */
1261 fix1 = _mm256_add_pd(fix1,tx);
1262 fiy1 = _mm256_add_pd(fiy1,ty);
1263 fiz1 = _mm256_add_pd(fiz1,tz);
1265 fjx0 = _mm256_add_pd(fjx0,tx);
1266 fjy0 = _mm256_add_pd(fjy0,ty);
1267 fjz0 = _mm256_add_pd(fjz0,tz);
1269 /**************************
1270 * CALCULATE INTERACTIONS *
1271 **************************/
1273 r20 = _mm256_mul_pd(rsq20,rinv20);
1274 r20 = _mm256_andnot_pd(dummy_mask,r20);
1276 /* Compute parameters for interactions between i and j atoms */
1277 qq20 = _mm256_mul_pd(iq2,jq0);
1279 /* EWALD ELECTROSTATICS */
1281 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1282 ewrt = _mm256_mul_pd(r20,ewtabscale);
1283 ewitab = _mm256_cvttpd_epi32(ewrt);
1284 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1285 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1286 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1288 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1289 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1293 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1295 /* Calculate temporary vectorial force */
1296 tx = _mm256_mul_pd(fscal,dx20);
1297 ty = _mm256_mul_pd(fscal,dy20);
1298 tz = _mm256_mul_pd(fscal,dz20);
1300 /* Update vectorial force */
1301 fix2 = _mm256_add_pd(fix2,tx);
1302 fiy2 = _mm256_add_pd(fiy2,ty);
1303 fiz2 = _mm256_add_pd(fiz2,tz);
1305 fjx0 = _mm256_add_pd(fjx0,tx);
1306 fjy0 = _mm256_add_pd(fjy0,ty);
1307 fjz0 = _mm256_add_pd(fjz0,tz);
1309 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1310 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1311 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1312 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1314 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1316 /* Inner loop uses 140 flops */
1319 /* End of innermost loop */
1321 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1322 f+i_coord_offset,fshift+i_shift_offset);
1324 /* Increment number of inner iterations */
1325 inneriter += j_index_end - j_index_start;
1327 /* Outer loop uses 18 flops */
1330 /* Increment number of outer iterations */
1333 /* Update outer/inner flops */
1335 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*140);