2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_256_double
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecRF_VdwCSTab_GeomW4P1_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
81 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
83 __m256d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
84 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
87 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
90 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
91 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
93 __m128i ifour = _mm_set1_epi32(4);
94 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
96 __m256d dummy_mask,cutoff_mask;
97 __m128 tmpmask0,tmpmask1;
98 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
99 __m256d one = _mm256_set1_pd(1.0);
100 __m256d two = _mm256_set1_pd(2.0);
106 jindex = nlist->jindex;
108 shiftidx = nlist->shift;
110 shiftvec = fr->shift_vec[0];
111 fshift = fr->fshift[0];
112 facel = _mm256_set1_pd(fr->epsfac);
113 charge = mdatoms->chargeA;
114 krf = _mm256_set1_pd(fr->ic->k_rf);
115 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
116 crf = _mm256_set1_pd(fr->ic->c_rf);
117 nvdwtype = fr->ntype;
119 vdwtype = mdatoms->typeA;
121 vftab = kernel_data->table_vdw->data;
122 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
124 /* Setup water-specific parameters */
125 inr = nlist->iinr[0];
126 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
127 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
128 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
129 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
131 /* Avoid stupid compiler warnings */
132 jnrA = jnrB = jnrC = jnrD = 0;
141 for(iidx=0;iidx<4*DIM;iidx++)
146 /* Start outer loop over neighborlists */
147 for(iidx=0; iidx<nri; iidx++)
149 /* Load shift vector for this list */
150 i_shift_offset = DIM*shiftidx[iidx];
152 /* Load limits for loop over neighbors */
153 j_index_start = jindex[iidx];
154 j_index_end = jindex[iidx+1];
156 /* Get outer coordinate index */
158 i_coord_offset = DIM*inr;
160 /* Load i particle coords and add shift vector */
161 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
162 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
164 fix0 = _mm256_setzero_pd();
165 fiy0 = _mm256_setzero_pd();
166 fiz0 = _mm256_setzero_pd();
167 fix1 = _mm256_setzero_pd();
168 fiy1 = _mm256_setzero_pd();
169 fiz1 = _mm256_setzero_pd();
170 fix2 = _mm256_setzero_pd();
171 fiy2 = _mm256_setzero_pd();
172 fiz2 = _mm256_setzero_pd();
173 fix3 = _mm256_setzero_pd();
174 fiy3 = _mm256_setzero_pd();
175 fiz3 = _mm256_setzero_pd();
177 /* Reset potential sums */
178 velecsum = _mm256_setzero_pd();
179 vvdwsum = _mm256_setzero_pd();
181 /* Start inner kernel loop */
182 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
185 /* Get j neighbor index, and coordinate index */
190 j_coord_offsetA = DIM*jnrA;
191 j_coord_offsetB = DIM*jnrB;
192 j_coord_offsetC = DIM*jnrC;
193 j_coord_offsetD = DIM*jnrD;
195 /* load j atom coordinates */
196 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
197 x+j_coord_offsetC,x+j_coord_offsetD,
200 /* Calculate displacement vector */
201 dx00 = _mm256_sub_pd(ix0,jx0);
202 dy00 = _mm256_sub_pd(iy0,jy0);
203 dz00 = _mm256_sub_pd(iz0,jz0);
204 dx10 = _mm256_sub_pd(ix1,jx0);
205 dy10 = _mm256_sub_pd(iy1,jy0);
206 dz10 = _mm256_sub_pd(iz1,jz0);
207 dx20 = _mm256_sub_pd(ix2,jx0);
208 dy20 = _mm256_sub_pd(iy2,jy0);
209 dz20 = _mm256_sub_pd(iz2,jz0);
210 dx30 = _mm256_sub_pd(ix3,jx0);
211 dy30 = _mm256_sub_pd(iy3,jy0);
212 dz30 = _mm256_sub_pd(iz3,jz0);
214 /* Calculate squared distance and things based on it */
215 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
216 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
217 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
218 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
220 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
221 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
222 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
223 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
225 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
226 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
227 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
229 /* Load parameters for j particles */
230 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
231 charge+jnrC+0,charge+jnrD+0);
232 vdwjidx0A = 2*vdwtype[jnrA+0];
233 vdwjidx0B = 2*vdwtype[jnrB+0];
234 vdwjidx0C = 2*vdwtype[jnrC+0];
235 vdwjidx0D = 2*vdwtype[jnrD+0];
237 fjx0 = _mm256_setzero_pd();
238 fjy0 = _mm256_setzero_pd();
239 fjz0 = _mm256_setzero_pd();
241 /**************************
242 * CALCULATE INTERACTIONS *
243 **************************/
245 r00 = _mm256_mul_pd(rsq00,rinv00);
247 /* Compute parameters for interactions between i and j atoms */
248 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
249 vdwioffsetptr0+vdwjidx0B,
250 vdwioffsetptr0+vdwjidx0C,
251 vdwioffsetptr0+vdwjidx0D,
254 /* Calculate table index by multiplying r with table scale and truncate to integer */
255 rt = _mm256_mul_pd(r00,vftabscale);
256 vfitab = _mm256_cvttpd_epi32(rt);
257 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
258 vfitab = _mm_slli_epi32(vfitab,3);
260 /* CUBIC SPLINE TABLE DISPERSION */
261 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
262 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
263 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
264 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
265 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
266 Heps = _mm256_mul_pd(vfeps,H);
267 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
268 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
269 vvdw6 = _mm256_mul_pd(c6_00,VV);
270 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
271 fvdw6 = _mm256_mul_pd(c6_00,FF);
273 /* CUBIC SPLINE TABLE REPULSION */
274 vfitab = _mm_add_epi32(vfitab,ifour);
275 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
276 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
277 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
278 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
279 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
280 Heps = _mm256_mul_pd(vfeps,H);
281 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
282 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
283 vvdw12 = _mm256_mul_pd(c12_00,VV);
284 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
285 fvdw12 = _mm256_mul_pd(c12_00,FF);
286 vvdw = _mm256_add_pd(vvdw12,vvdw6);
287 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
289 /* Update potential sum for this i atom from the interaction with this j atom. */
290 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
294 /* Calculate temporary vectorial force */
295 tx = _mm256_mul_pd(fscal,dx00);
296 ty = _mm256_mul_pd(fscal,dy00);
297 tz = _mm256_mul_pd(fscal,dz00);
299 /* Update vectorial force */
300 fix0 = _mm256_add_pd(fix0,tx);
301 fiy0 = _mm256_add_pd(fiy0,ty);
302 fiz0 = _mm256_add_pd(fiz0,tz);
304 fjx0 = _mm256_add_pd(fjx0,tx);
305 fjy0 = _mm256_add_pd(fjy0,ty);
306 fjz0 = _mm256_add_pd(fjz0,tz);
308 /**************************
309 * CALCULATE INTERACTIONS *
310 **************************/
312 /* Compute parameters for interactions between i and j atoms */
313 qq10 = _mm256_mul_pd(iq1,jq0);
315 /* REACTION-FIELD ELECTROSTATICS */
316 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
317 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
319 /* Update potential sum for this i atom from the interaction with this j atom. */
320 velecsum = _mm256_add_pd(velecsum,velec);
324 /* Calculate temporary vectorial force */
325 tx = _mm256_mul_pd(fscal,dx10);
326 ty = _mm256_mul_pd(fscal,dy10);
327 tz = _mm256_mul_pd(fscal,dz10);
329 /* Update vectorial force */
330 fix1 = _mm256_add_pd(fix1,tx);
331 fiy1 = _mm256_add_pd(fiy1,ty);
332 fiz1 = _mm256_add_pd(fiz1,tz);
334 fjx0 = _mm256_add_pd(fjx0,tx);
335 fjy0 = _mm256_add_pd(fjy0,ty);
336 fjz0 = _mm256_add_pd(fjz0,tz);
338 /**************************
339 * CALCULATE INTERACTIONS *
340 **************************/
342 /* Compute parameters for interactions between i and j atoms */
343 qq20 = _mm256_mul_pd(iq2,jq0);
345 /* REACTION-FIELD ELECTROSTATICS */
346 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
347 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 velecsum = _mm256_add_pd(velecsum,velec);
354 /* Calculate temporary vectorial force */
355 tx = _mm256_mul_pd(fscal,dx20);
356 ty = _mm256_mul_pd(fscal,dy20);
357 tz = _mm256_mul_pd(fscal,dz20);
359 /* Update vectorial force */
360 fix2 = _mm256_add_pd(fix2,tx);
361 fiy2 = _mm256_add_pd(fiy2,ty);
362 fiz2 = _mm256_add_pd(fiz2,tz);
364 fjx0 = _mm256_add_pd(fjx0,tx);
365 fjy0 = _mm256_add_pd(fjy0,ty);
366 fjz0 = _mm256_add_pd(fjz0,tz);
368 /**************************
369 * CALCULATE INTERACTIONS *
370 **************************/
372 /* Compute parameters for interactions between i and j atoms */
373 qq30 = _mm256_mul_pd(iq3,jq0);
375 /* REACTION-FIELD ELECTROSTATICS */
376 velec = _mm256_mul_pd(qq30,_mm256_sub_pd(_mm256_add_pd(rinv30,_mm256_mul_pd(krf,rsq30)),crf));
377 felec = _mm256_mul_pd(qq30,_mm256_sub_pd(_mm256_mul_pd(rinv30,rinvsq30),krf2));
379 /* Update potential sum for this i atom from the interaction with this j atom. */
380 velecsum = _mm256_add_pd(velecsum,velec);
384 /* Calculate temporary vectorial force */
385 tx = _mm256_mul_pd(fscal,dx30);
386 ty = _mm256_mul_pd(fscal,dy30);
387 tz = _mm256_mul_pd(fscal,dz30);
389 /* Update vectorial force */
390 fix3 = _mm256_add_pd(fix3,tx);
391 fiy3 = _mm256_add_pd(fiy3,ty);
392 fiz3 = _mm256_add_pd(fiz3,tz);
394 fjx0 = _mm256_add_pd(fjx0,tx);
395 fjy0 = _mm256_add_pd(fjy0,ty);
396 fjz0 = _mm256_add_pd(fjz0,tz);
398 fjptrA = f+j_coord_offsetA;
399 fjptrB = f+j_coord_offsetB;
400 fjptrC = f+j_coord_offsetC;
401 fjptrD = f+j_coord_offsetD;
403 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
405 /* Inner loop uses 155 flops */
411 /* Get j neighbor index, and coordinate index */
412 jnrlistA = jjnr[jidx];
413 jnrlistB = jjnr[jidx+1];
414 jnrlistC = jjnr[jidx+2];
415 jnrlistD = jjnr[jidx+3];
416 /* Sign of each element will be negative for non-real atoms.
417 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
418 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
420 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
422 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
423 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
424 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
426 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
427 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
428 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
429 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
430 j_coord_offsetA = DIM*jnrA;
431 j_coord_offsetB = DIM*jnrB;
432 j_coord_offsetC = DIM*jnrC;
433 j_coord_offsetD = DIM*jnrD;
435 /* load j atom coordinates */
436 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
437 x+j_coord_offsetC,x+j_coord_offsetD,
440 /* Calculate displacement vector */
441 dx00 = _mm256_sub_pd(ix0,jx0);
442 dy00 = _mm256_sub_pd(iy0,jy0);
443 dz00 = _mm256_sub_pd(iz0,jz0);
444 dx10 = _mm256_sub_pd(ix1,jx0);
445 dy10 = _mm256_sub_pd(iy1,jy0);
446 dz10 = _mm256_sub_pd(iz1,jz0);
447 dx20 = _mm256_sub_pd(ix2,jx0);
448 dy20 = _mm256_sub_pd(iy2,jy0);
449 dz20 = _mm256_sub_pd(iz2,jz0);
450 dx30 = _mm256_sub_pd(ix3,jx0);
451 dy30 = _mm256_sub_pd(iy3,jy0);
452 dz30 = _mm256_sub_pd(iz3,jz0);
454 /* Calculate squared distance and things based on it */
455 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
456 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
457 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
458 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
460 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
461 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
462 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
463 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
465 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
466 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
467 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
469 /* Load parameters for j particles */
470 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
471 charge+jnrC+0,charge+jnrD+0);
472 vdwjidx0A = 2*vdwtype[jnrA+0];
473 vdwjidx0B = 2*vdwtype[jnrB+0];
474 vdwjidx0C = 2*vdwtype[jnrC+0];
475 vdwjidx0D = 2*vdwtype[jnrD+0];
477 fjx0 = _mm256_setzero_pd();
478 fjy0 = _mm256_setzero_pd();
479 fjz0 = _mm256_setzero_pd();
481 /**************************
482 * CALCULATE INTERACTIONS *
483 **************************/
485 r00 = _mm256_mul_pd(rsq00,rinv00);
486 r00 = _mm256_andnot_pd(dummy_mask,r00);
488 /* Compute parameters for interactions between i and j atoms */
489 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
490 vdwioffsetptr0+vdwjidx0B,
491 vdwioffsetptr0+vdwjidx0C,
492 vdwioffsetptr0+vdwjidx0D,
495 /* Calculate table index by multiplying r with table scale and truncate to integer */
496 rt = _mm256_mul_pd(r00,vftabscale);
497 vfitab = _mm256_cvttpd_epi32(rt);
498 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
499 vfitab = _mm_slli_epi32(vfitab,3);
501 /* CUBIC SPLINE TABLE DISPERSION */
502 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
503 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
504 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
505 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
506 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
507 Heps = _mm256_mul_pd(vfeps,H);
508 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
509 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
510 vvdw6 = _mm256_mul_pd(c6_00,VV);
511 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
512 fvdw6 = _mm256_mul_pd(c6_00,FF);
514 /* CUBIC SPLINE TABLE REPULSION */
515 vfitab = _mm_add_epi32(vfitab,ifour);
516 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
517 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
518 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
519 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
520 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
521 Heps = _mm256_mul_pd(vfeps,H);
522 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
523 VV = _mm256_add_pd(Y,_mm256_mul_pd(vfeps,Fp));
524 vvdw12 = _mm256_mul_pd(c12_00,VV);
525 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
526 fvdw12 = _mm256_mul_pd(c12_00,FF);
527 vvdw = _mm256_add_pd(vvdw12,vvdw6);
528 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
530 /* Update potential sum for this i atom from the interaction with this j atom. */
531 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
532 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
536 fscal = _mm256_andnot_pd(dummy_mask,fscal);
538 /* Calculate temporary vectorial force */
539 tx = _mm256_mul_pd(fscal,dx00);
540 ty = _mm256_mul_pd(fscal,dy00);
541 tz = _mm256_mul_pd(fscal,dz00);
543 /* Update vectorial force */
544 fix0 = _mm256_add_pd(fix0,tx);
545 fiy0 = _mm256_add_pd(fiy0,ty);
546 fiz0 = _mm256_add_pd(fiz0,tz);
548 fjx0 = _mm256_add_pd(fjx0,tx);
549 fjy0 = _mm256_add_pd(fjy0,ty);
550 fjz0 = _mm256_add_pd(fjz0,tz);
552 /**************************
553 * CALCULATE INTERACTIONS *
554 **************************/
556 /* Compute parameters for interactions between i and j atoms */
557 qq10 = _mm256_mul_pd(iq1,jq0);
559 /* REACTION-FIELD ELECTROSTATICS */
560 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_add_pd(rinv10,_mm256_mul_pd(krf,rsq10)),crf));
561 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
563 /* Update potential sum for this i atom from the interaction with this j atom. */
564 velec = _mm256_andnot_pd(dummy_mask,velec);
565 velecsum = _mm256_add_pd(velecsum,velec);
569 fscal = _mm256_andnot_pd(dummy_mask,fscal);
571 /* Calculate temporary vectorial force */
572 tx = _mm256_mul_pd(fscal,dx10);
573 ty = _mm256_mul_pd(fscal,dy10);
574 tz = _mm256_mul_pd(fscal,dz10);
576 /* Update vectorial force */
577 fix1 = _mm256_add_pd(fix1,tx);
578 fiy1 = _mm256_add_pd(fiy1,ty);
579 fiz1 = _mm256_add_pd(fiz1,tz);
581 fjx0 = _mm256_add_pd(fjx0,tx);
582 fjy0 = _mm256_add_pd(fjy0,ty);
583 fjz0 = _mm256_add_pd(fjz0,tz);
585 /**************************
586 * CALCULATE INTERACTIONS *
587 **************************/
589 /* Compute parameters for interactions between i and j atoms */
590 qq20 = _mm256_mul_pd(iq2,jq0);
592 /* REACTION-FIELD ELECTROSTATICS */
593 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_add_pd(rinv20,_mm256_mul_pd(krf,rsq20)),crf));
594 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
596 /* Update potential sum for this i atom from the interaction with this j atom. */
597 velec = _mm256_andnot_pd(dummy_mask,velec);
598 velecsum = _mm256_add_pd(velecsum,velec);
602 fscal = _mm256_andnot_pd(dummy_mask,fscal);
604 /* Calculate temporary vectorial force */
605 tx = _mm256_mul_pd(fscal,dx20);
606 ty = _mm256_mul_pd(fscal,dy20);
607 tz = _mm256_mul_pd(fscal,dz20);
609 /* Update vectorial force */
610 fix2 = _mm256_add_pd(fix2,tx);
611 fiy2 = _mm256_add_pd(fiy2,ty);
612 fiz2 = _mm256_add_pd(fiz2,tz);
614 fjx0 = _mm256_add_pd(fjx0,tx);
615 fjy0 = _mm256_add_pd(fjy0,ty);
616 fjz0 = _mm256_add_pd(fjz0,tz);
618 /**************************
619 * CALCULATE INTERACTIONS *
620 **************************/
622 /* Compute parameters for interactions between i and j atoms */
623 qq30 = _mm256_mul_pd(iq3,jq0);
625 /* REACTION-FIELD ELECTROSTATICS */
626 velec = _mm256_mul_pd(qq30,_mm256_sub_pd(_mm256_add_pd(rinv30,_mm256_mul_pd(krf,rsq30)),crf));
627 felec = _mm256_mul_pd(qq30,_mm256_sub_pd(_mm256_mul_pd(rinv30,rinvsq30),krf2));
629 /* Update potential sum for this i atom from the interaction with this j atom. */
630 velec = _mm256_andnot_pd(dummy_mask,velec);
631 velecsum = _mm256_add_pd(velecsum,velec);
635 fscal = _mm256_andnot_pd(dummy_mask,fscal);
637 /* Calculate temporary vectorial force */
638 tx = _mm256_mul_pd(fscal,dx30);
639 ty = _mm256_mul_pd(fscal,dy30);
640 tz = _mm256_mul_pd(fscal,dz30);
642 /* Update vectorial force */
643 fix3 = _mm256_add_pd(fix3,tx);
644 fiy3 = _mm256_add_pd(fiy3,ty);
645 fiz3 = _mm256_add_pd(fiz3,tz);
647 fjx0 = _mm256_add_pd(fjx0,tx);
648 fjy0 = _mm256_add_pd(fjy0,ty);
649 fjz0 = _mm256_add_pd(fjz0,tz);
651 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
652 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
653 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
654 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
656 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
658 /* Inner loop uses 156 flops */
661 /* End of innermost loop */
663 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
664 f+i_coord_offset,fshift+i_shift_offset);
667 /* Update potential energies */
668 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
669 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
671 /* Increment number of inner iterations */
672 inneriter += j_index_end - j_index_start;
674 /* Outer loop uses 26 flops */
677 /* Increment number of outer iterations */
680 /* Update outer/inner flops */
682 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*156);
685 * Gromacs nonbonded kernel: nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_256_double
686 * Electrostatics interaction: ReactionField
687 * VdW interaction: CubicSplineTable
688 * Geometry: Water4-Particle
689 * Calculate force/pot: Force
692 nb_kernel_ElecRF_VdwCSTab_GeomW4P1_F_avx_256_double
693 (t_nblist * gmx_restrict nlist,
694 rvec * gmx_restrict xx,
695 rvec * gmx_restrict ff,
696 t_forcerec * gmx_restrict fr,
697 t_mdatoms * gmx_restrict mdatoms,
698 nb_kernel_data_t * gmx_restrict kernel_data,
699 t_nrnb * gmx_restrict nrnb)
701 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
702 * just 0 for non-waters.
703 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
704 * jnr indices corresponding to data put in the four positions in the SIMD register.
706 int i_shift_offset,i_coord_offset,outeriter,inneriter;
707 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
708 int jnrA,jnrB,jnrC,jnrD;
709 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
710 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
711 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
712 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
714 real *shiftvec,*fshift,*x,*f;
715 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
717 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
718 real * vdwioffsetptr0;
719 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
720 real * vdwioffsetptr1;
721 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
722 real * vdwioffsetptr2;
723 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
724 real * vdwioffsetptr3;
725 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
726 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
727 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
728 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
729 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
730 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
731 __m256d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
732 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
735 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
738 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
739 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
741 __m128i ifour = _mm_set1_epi32(4);
742 __m256d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF;
744 __m256d dummy_mask,cutoff_mask;
745 __m128 tmpmask0,tmpmask1;
746 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
747 __m256d one = _mm256_set1_pd(1.0);
748 __m256d two = _mm256_set1_pd(2.0);
754 jindex = nlist->jindex;
756 shiftidx = nlist->shift;
758 shiftvec = fr->shift_vec[0];
759 fshift = fr->fshift[0];
760 facel = _mm256_set1_pd(fr->epsfac);
761 charge = mdatoms->chargeA;
762 krf = _mm256_set1_pd(fr->ic->k_rf);
763 krf2 = _mm256_set1_pd(fr->ic->k_rf*2.0);
764 crf = _mm256_set1_pd(fr->ic->c_rf);
765 nvdwtype = fr->ntype;
767 vdwtype = mdatoms->typeA;
769 vftab = kernel_data->table_vdw->data;
770 vftabscale = _mm256_set1_pd(kernel_data->table_vdw->scale);
772 /* Setup water-specific parameters */
773 inr = nlist->iinr[0];
774 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
775 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
776 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
777 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
779 /* Avoid stupid compiler warnings */
780 jnrA = jnrB = jnrC = jnrD = 0;
789 for(iidx=0;iidx<4*DIM;iidx++)
794 /* Start outer loop over neighborlists */
795 for(iidx=0; iidx<nri; iidx++)
797 /* Load shift vector for this list */
798 i_shift_offset = DIM*shiftidx[iidx];
800 /* Load limits for loop over neighbors */
801 j_index_start = jindex[iidx];
802 j_index_end = jindex[iidx+1];
804 /* Get outer coordinate index */
806 i_coord_offset = DIM*inr;
808 /* Load i particle coords and add shift vector */
809 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
810 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
812 fix0 = _mm256_setzero_pd();
813 fiy0 = _mm256_setzero_pd();
814 fiz0 = _mm256_setzero_pd();
815 fix1 = _mm256_setzero_pd();
816 fiy1 = _mm256_setzero_pd();
817 fiz1 = _mm256_setzero_pd();
818 fix2 = _mm256_setzero_pd();
819 fiy2 = _mm256_setzero_pd();
820 fiz2 = _mm256_setzero_pd();
821 fix3 = _mm256_setzero_pd();
822 fiy3 = _mm256_setzero_pd();
823 fiz3 = _mm256_setzero_pd();
825 /* Start inner kernel loop */
826 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
829 /* Get j neighbor index, and coordinate index */
834 j_coord_offsetA = DIM*jnrA;
835 j_coord_offsetB = DIM*jnrB;
836 j_coord_offsetC = DIM*jnrC;
837 j_coord_offsetD = DIM*jnrD;
839 /* load j atom coordinates */
840 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
841 x+j_coord_offsetC,x+j_coord_offsetD,
844 /* Calculate displacement vector */
845 dx00 = _mm256_sub_pd(ix0,jx0);
846 dy00 = _mm256_sub_pd(iy0,jy0);
847 dz00 = _mm256_sub_pd(iz0,jz0);
848 dx10 = _mm256_sub_pd(ix1,jx0);
849 dy10 = _mm256_sub_pd(iy1,jy0);
850 dz10 = _mm256_sub_pd(iz1,jz0);
851 dx20 = _mm256_sub_pd(ix2,jx0);
852 dy20 = _mm256_sub_pd(iy2,jy0);
853 dz20 = _mm256_sub_pd(iz2,jz0);
854 dx30 = _mm256_sub_pd(ix3,jx0);
855 dy30 = _mm256_sub_pd(iy3,jy0);
856 dz30 = _mm256_sub_pd(iz3,jz0);
858 /* Calculate squared distance and things based on it */
859 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
860 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
861 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
862 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
864 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
865 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
866 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
867 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
869 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
870 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
871 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
873 /* Load parameters for j particles */
874 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
875 charge+jnrC+0,charge+jnrD+0);
876 vdwjidx0A = 2*vdwtype[jnrA+0];
877 vdwjidx0B = 2*vdwtype[jnrB+0];
878 vdwjidx0C = 2*vdwtype[jnrC+0];
879 vdwjidx0D = 2*vdwtype[jnrD+0];
881 fjx0 = _mm256_setzero_pd();
882 fjy0 = _mm256_setzero_pd();
883 fjz0 = _mm256_setzero_pd();
885 /**************************
886 * CALCULATE INTERACTIONS *
887 **************************/
889 r00 = _mm256_mul_pd(rsq00,rinv00);
891 /* Compute parameters for interactions between i and j atoms */
892 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
893 vdwioffsetptr0+vdwjidx0B,
894 vdwioffsetptr0+vdwjidx0C,
895 vdwioffsetptr0+vdwjidx0D,
898 /* Calculate table index by multiplying r with table scale and truncate to integer */
899 rt = _mm256_mul_pd(r00,vftabscale);
900 vfitab = _mm256_cvttpd_epi32(rt);
901 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
902 vfitab = _mm_slli_epi32(vfitab,3);
904 /* CUBIC SPLINE TABLE DISPERSION */
905 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
906 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
907 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
908 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
909 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
910 Heps = _mm256_mul_pd(vfeps,H);
911 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
912 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
913 fvdw6 = _mm256_mul_pd(c6_00,FF);
915 /* CUBIC SPLINE TABLE REPULSION */
916 vfitab = _mm_add_epi32(vfitab,ifour);
917 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
918 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
919 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
920 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
921 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
922 Heps = _mm256_mul_pd(vfeps,H);
923 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
924 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
925 fvdw12 = _mm256_mul_pd(c12_00,FF);
926 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
930 /* Calculate temporary vectorial force */
931 tx = _mm256_mul_pd(fscal,dx00);
932 ty = _mm256_mul_pd(fscal,dy00);
933 tz = _mm256_mul_pd(fscal,dz00);
935 /* Update vectorial force */
936 fix0 = _mm256_add_pd(fix0,tx);
937 fiy0 = _mm256_add_pd(fiy0,ty);
938 fiz0 = _mm256_add_pd(fiz0,tz);
940 fjx0 = _mm256_add_pd(fjx0,tx);
941 fjy0 = _mm256_add_pd(fjy0,ty);
942 fjz0 = _mm256_add_pd(fjz0,tz);
944 /**************************
945 * CALCULATE INTERACTIONS *
946 **************************/
948 /* Compute parameters for interactions between i and j atoms */
949 qq10 = _mm256_mul_pd(iq1,jq0);
951 /* REACTION-FIELD ELECTROSTATICS */
952 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
956 /* Calculate temporary vectorial force */
957 tx = _mm256_mul_pd(fscal,dx10);
958 ty = _mm256_mul_pd(fscal,dy10);
959 tz = _mm256_mul_pd(fscal,dz10);
961 /* Update vectorial force */
962 fix1 = _mm256_add_pd(fix1,tx);
963 fiy1 = _mm256_add_pd(fiy1,ty);
964 fiz1 = _mm256_add_pd(fiz1,tz);
966 fjx0 = _mm256_add_pd(fjx0,tx);
967 fjy0 = _mm256_add_pd(fjy0,ty);
968 fjz0 = _mm256_add_pd(fjz0,tz);
970 /**************************
971 * CALCULATE INTERACTIONS *
972 **************************/
974 /* Compute parameters for interactions between i and j atoms */
975 qq20 = _mm256_mul_pd(iq2,jq0);
977 /* REACTION-FIELD ELECTROSTATICS */
978 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
982 /* Calculate temporary vectorial force */
983 tx = _mm256_mul_pd(fscal,dx20);
984 ty = _mm256_mul_pd(fscal,dy20);
985 tz = _mm256_mul_pd(fscal,dz20);
987 /* Update vectorial force */
988 fix2 = _mm256_add_pd(fix2,tx);
989 fiy2 = _mm256_add_pd(fiy2,ty);
990 fiz2 = _mm256_add_pd(fiz2,tz);
992 fjx0 = _mm256_add_pd(fjx0,tx);
993 fjy0 = _mm256_add_pd(fjy0,ty);
994 fjz0 = _mm256_add_pd(fjz0,tz);
996 /**************************
997 * CALCULATE INTERACTIONS *
998 **************************/
1000 /* Compute parameters for interactions between i and j atoms */
1001 qq30 = _mm256_mul_pd(iq3,jq0);
1003 /* REACTION-FIELD ELECTROSTATICS */
1004 felec = _mm256_mul_pd(qq30,_mm256_sub_pd(_mm256_mul_pd(rinv30,rinvsq30),krf2));
1008 /* Calculate temporary vectorial force */
1009 tx = _mm256_mul_pd(fscal,dx30);
1010 ty = _mm256_mul_pd(fscal,dy30);
1011 tz = _mm256_mul_pd(fscal,dz30);
1013 /* Update vectorial force */
1014 fix3 = _mm256_add_pd(fix3,tx);
1015 fiy3 = _mm256_add_pd(fiy3,ty);
1016 fiz3 = _mm256_add_pd(fiz3,tz);
1018 fjx0 = _mm256_add_pd(fjx0,tx);
1019 fjy0 = _mm256_add_pd(fjy0,ty);
1020 fjz0 = _mm256_add_pd(fjz0,tz);
1022 fjptrA = f+j_coord_offsetA;
1023 fjptrB = f+j_coord_offsetB;
1024 fjptrC = f+j_coord_offsetC;
1025 fjptrD = f+j_coord_offsetD;
1027 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1029 /* Inner loop uses 132 flops */
1032 if(jidx<j_index_end)
1035 /* Get j neighbor index, and coordinate index */
1036 jnrlistA = jjnr[jidx];
1037 jnrlistB = jjnr[jidx+1];
1038 jnrlistC = jjnr[jidx+2];
1039 jnrlistD = jjnr[jidx+3];
1040 /* Sign of each element will be negative for non-real atoms.
1041 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1042 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1044 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1046 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1047 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1048 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1050 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1051 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1052 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1053 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1054 j_coord_offsetA = DIM*jnrA;
1055 j_coord_offsetB = DIM*jnrB;
1056 j_coord_offsetC = DIM*jnrC;
1057 j_coord_offsetD = DIM*jnrD;
1059 /* load j atom coordinates */
1060 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1061 x+j_coord_offsetC,x+j_coord_offsetD,
1064 /* Calculate displacement vector */
1065 dx00 = _mm256_sub_pd(ix0,jx0);
1066 dy00 = _mm256_sub_pd(iy0,jy0);
1067 dz00 = _mm256_sub_pd(iz0,jz0);
1068 dx10 = _mm256_sub_pd(ix1,jx0);
1069 dy10 = _mm256_sub_pd(iy1,jy0);
1070 dz10 = _mm256_sub_pd(iz1,jz0);
1071 dx20 = _mm256_sub_pd(ix2,jx0);
1072 dy20 = _mm256_sub_pd(iy2,jy0);
1073 dz20 = _mm256_sub_pd(iz2,jz0);
1074 dx30 = _mm256_sub_pd(ix3,jx0);
1075 dy30 = _mm256_sub_pd(iy3,jy0);
1076 dz30 = _mm256_sub_pd(iz3,jz0);
1078 /* Calculate squared distance and things based on it */
1079 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1080 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1081 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1082 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
1084 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1085 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1086 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1087 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
1089 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1090 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1091 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
1093 /* Load parameters for j particles */
1094 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1095 charge+jnrC+0,charge+jnrD+0);
1096 vdwjidx0A = 2*vdwtype[jnrA+0];
1097 vdwjidx0B = 2*vdwtype[jnrB+0];
1098 vdwjidx0C = 2*vdwtype[jnrC+0];
1099 vdwjidx0D = 2*vdwtype[jnrD+0];
1101 fjx0 = _mm256_setzero_pd();
1102 fjy0 = _mm256_setzero_pd();
1103 fjz0 = _mm256_setzero_pd();
1105 /**************************
1106 * CALCULATE INTERACTIONS *
1107 **************************/
1109 r00 = _mm256_mul_pd(rsq00,rinv00);
1110 r00 = _mm256_andnot_pd(dummy_mask,r00);
1112 /* Compute parameters for interactions between i and j atoms */
1113 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1114 vdwioffsetptr0+vdwjidx0B,
1115 vdwioffsetptr0+vdwjidx0C,
1116 vdwioffsetptr0+vdwjidx0D,
1119 /* Calculate table index by multiplying r with table scale and truncate to integer */
1120 rt = _mm256_mul_pd(r00,vftabscale);
1121 vfitab = _mm256_cvttpd_epi32(rt);
1122 vfeps = _mm256_sub_pd(rt,_mm256_round_pd(rt, _MM_FROUND_FLOOR));
1123 vfitab = _mm_slli_epi32(vfitab,3);
1125 /* CUBIC SPLINE TABLE DISPERSION */
1126 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1127 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1128 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1129 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1130 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1131 Heps = _mm256_mul_pd(vfeps,H);
1132 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1133 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1134 fvdw6 = _mm256_mul_pd(c6_00,FF);
1136 /* CUBIC SPLINE TABLE REPULSION */
1137 vfitab = _mm_add_epi32(vfitab,ifour);
1138 Y = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1139 F = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
1140 G = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,2) );
1141 H = _mm256_load_pd( vftab + _mm_extract_epi32(vfitab,3) );
1142 GMX_MM256_FULLTRANSPOSE4_PD(Y,F,G,H);
1143 Heps = _mm256_mul_pd(vfeps,H);
1144 Fp = _mm256_add_pd(F,_mm256_mul_pd(vfeps,_mm256_add_pd(G,Heps)));
1145 FF = _mm256_add_pd(Fp,_mm256_mul_pd(vfeps,_mm256_add_pd(G,_mm256_add_pd(Heps,Heps))));
1146 fvdw12 = _mm256_mul_pd(c12_00,FF);
1147 fvdw = _mm256_xor_pd(signbit,_mm256_mul_pd(_mm256_add_pd(fvdw6,fvdw12),_mm256_mul_pd(vftabscale,rinv00)));
1151 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1153 /* Calculate temporary vectorial force */
1154 tx = _mm256_mul_pd(fscal,dx00);
1155 ty = _mm256_mul_pd(fscal,dy00);
1156 tz = _mm256_mul_pd(fscal,dz00);
1158 /* Update vectorial force */
1159 fix0 = _mm256_add_pd(fix0,tx);
1160 fiy0 = _mm256_add_pd(fiy0,ty);
1161 fiz0 = _mm256_add_pd(fiz0,tz);
1163 fjx0 = _mm256_add_pd(fjx0,tx);
1164 fjy0 = _mm256_add_pd(fjy0,ty);
1165 fjz0 = _mm256_add_pd(fjz0,tz);
1167 /**************************
1168 * CALCULATE INTERACTIONS *
1169 **************************/
1171 /* Compute parameters for interactions between i and j atoms */
1172 qq10 = _mm256_mul_pd(iq1,jq0);
1174 /* REACTION-FIELD ELECTROSTATICS */
1175 felec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_mul_pd(rinv10,rinvsq10),krf2));
1179 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1181 /* Calculate temporary vectorial force */
1182 tx = _mm256_mul_pd(fscal,dx10);
1183 ty = _mm256_mul_pd(fscal,dy10);
1184 tz = _mm256_mul_pd(fscal,dz10);
1186 /* Update vectorial force */
1187 fix1 = _mm256_add_pd(fix1,tx);
1188 fiy1 = _mm256_add_pd(fiy1,ty);
1189 fiz1 = _mm256_add_pd(fiz1,tz);
1191 fjx0 = _mm256_add_pd(fjx0,tx);
1192 fjy0 = _mm256_add_pd(fjy0,ty);
1193 fjz0 = _mm256_add_pd(fjz0,tz);
1195 /**************************
1196 * CALCULATE INTERACTIONS *
1197 **************************/
1199 /* Compute parameters for interactions between i and j atoms */
1200 qq20 = _mm256_mul_pd(iq2,jq0);
1202 /* REACTION-FIELD ELECTROSTATICS */
1203 felec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_mul_pd(rinv20,rinvsq20),krf2));
1207 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1209 /* Calculate temporary vectorial force */
1210 tx = _mm256_mul_pd(fscal,dx20);
1211 ty = _mm256_mul_pd(fscal,dy20);
1212 tz = _mm256_mul_pd(fscal,dz20);
1214 /* Update vectorial force */
1215 fix2 = _mm256_add_pd(fix2,tx);
1216 fiy2 = _mm256_add_pd(fiy2,ty);
1217 fiz2 = _mm256_add_pd(fiz2,tz);
1219 fjx0 = _mm256_add_pd(fjx0,tx);
1220 fjy0 = _mm256_add_pd(fjy0,ty);
1221 fjz0 = _mm256_add_pd(fjz0,tz);
1223 /**************************
1224 * CALCULATE INTERACTIONS *
1225 **************************/
1227 /* Compute parameters for interactions between i and j atoms */
1228 qq30 = _mm256_mul_pd(iq3,jq0);
1230 /* REACTION-FIELD ELECTROSTATICS */
1231 felec = _mm256_mul_pd(qq30,_mm256_sub_pd(_mm256_mul_pd(rinv30,rinvsq30),krf2));
1235 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1237 /* Calculate temporary vectorial force */
1238 tx = _mm256_mul_pd(fscal,dx30);
1239 ty = _mm256_mul_pd(fscal,dy30);
1240 tz = _mm256_mul_pd(fscal,dz30);
1242 /* Update vectorial force */
1243 fix3 = _mm256_add_pd(fix3,tx);
1244 fiy3 = _mm256_add_pd(fiy3,ty);
1245 fiz3 = _mm256_add_pd(fiz3,tz);
1247 fjx0 = _mm256_add_pd(fjx0,tx);
1248 fjy0 = _mm256_add_pd(fjy0,ty);
1249 fjz0 = _mm256_add_pd(fjz0,tz);
1251 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1252 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1253 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1254 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1256 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1258 /* Inner loop uses 133 flops */
1261 /* End of innermost loop */
1263 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1264 f+i_coord_offset,fshift+i_shift_offset);
1266 /* Increment number of inner iterations */
1267 inneriter += j_index_end - j_index_start;
1269 /* Outer loop uses 24 flops */
1272 /* Increment number of outer iterations */
1275 /* Update outer/inner flops */
1277 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*133);