2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_256_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_256_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_double
51 * Electrostatics interaction: Coulomb
52 * VdW interaction: LennardJones
53 * Geometry: Water4-Water4
54 * Calculate force/pot: PotentialAndForce
/*
 * nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_double
 *
 * Nonbonded kernel for pairs of four-site water molecules. It evaluates plain
 * Coulomb electrostatics and Lennard-Jones interactions and accumulates both
 * potential energies and forces. Four j waters are processed per inner
 * iteration, one in each lane of a __m256d register (suffixes A,B,C,D).
 *
 * Site 0 only enters the Lennard-Jones term (pair 0-0); sites 1,2,3 only enter
 * the nine Coulomb pairs (1-1 up to 3-3).
 *
 * Parameters, as far as the visible statements use them:
 *   nlist       - neighbor list; jindex, shift and iinr[0] are read from it.
 *   xx, ff      - not referenced by any statement visible here; presumably the
 *                 sources of the local pointers x and f (TODO confirm).
 *   fr          - supplies shift_vec, fshift, ic->epsfac and ntype.
 *   mdatoms     - supplies chargeA and typeA.
 *   kernel_data - energygrp_elec and energygrp_vdw receive the energy sums.
 *                 It is tagged gmx_unused although it is read below.
 *   nrnb        - flop counters, updated through inc_nrnb() at the end.
 *
 * NOTE(review): in the text as reviewed, this function has no braces and
 * several names are used without a visible declaration or assignment
 * (fscal, x, f, nri, jjnr, ggid, charge, vdwtype, vdwparam, nvdwtype,
 * scratch, outeriter, inneriter). Statements appear to be missing; compare
 * with the kernel generator output before relying on this copy.
 */
57 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
73 int jnrA,jnrB,jnrC,jnrD;
74 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
75 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
76 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
77 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
79 real *shiftvec,*fshift,*x,*f;
80 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
82 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
83 real * vdwioffsetptr0;
84 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
85 real * vdwioffsetptr1;
86 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
87 real * vdwioffsetptr2;
88 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
89 real * vdwioffsetptr3;
90 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
91 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
92 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
93 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
94 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
95 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
96 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
97 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
98 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
99 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
100 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
101 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
102 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
103 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
104 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
105 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
106 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
107 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
108 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
109 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
112 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
115 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
116 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
117 __m256d dummy_mask,cutoff_mask;
118 __m128 tmpmask0,tmpmask1;
119 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
120 __m256d one = _mm256_set1_pd(1.0);
121 __m256d two = _mm256_set1_pd(2.0);
127 jindex = nlist->jindex;
129 shiftidx = nlist->shift;
131 shiftvec = fr->shift_vec[0];
132 fshift = fr->fshift[0];
133 facel = _mm256_set1_pd(fr->ic->epsfac);
134 charge = mdatoms->chargeA;
135 nvdwtype = fr->ntype;
137 vdwtype = mdatoms->typeA;
139 /* Setup water-specific parameters */
/* inr is taken from the first list entry only (nlist->iinr[0]). The charges and
 * the Lennard-Jones type read through it are computed once here and reused for
 * every i and j water, so all molecules are treated as having identical
 * parameters.
 */
140 inr = nlist->iinr[0];
/* i-side charges of sites 1..3, pre-multiplied by facel (fr->ic->epsfac). */
141 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
142 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
143 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
144 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
/* j-side charges come from the same inr; no facel factor here because it is
 * already folded into iq1..iq3.
 */
146 jq1 = _mm256_set1_pd(charge[inr+1]);
147 jq2 = _mm256_set1_pd(charge[inr+2]);
148 jq3 = _mm256_set1_pd(charge[inr+3]);
/* Lennard-Jones parameters for the site 0 - site 0 pair: two consecutive
 * entries (c6, then c12) in the table that vdwioffsetptr0 points into.
 */
149 vdwjidx0A = 2*vdwtype[inr+0];
150 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
151 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
/* Charge products for the nine Coulomb site pairs: qqXY = iqX * jqY. */
152 qq11 = _mm256_mul_pd(iq1,jq1);
153 qq12 = _mm256_mul_pd(iq1,jq2);
154 qq13 = _mm256_mul_pd(iq1,jq3);
155 qq21 = _mm256_mul_pd(iq2,jq1);
156 qq22 = _mm256_mul_pd(iq2,jq2);
157 qq23 = _mm256_mul_pd(iq2,jq3);
158 qq31 = _mm256_mul_pd(iq3,jq1);
159 qq32 = _mm256_mul_pd(iq3,jq2);
160 qq33 = _mm256_mul_pd(iq3,jq3);
162 /* Avoid stupid compiler warnings */
163 jnrA = jnrB = jnrC = jnrD = 0;
/* NOTE(review): the body of the following loop is not visible in this text.
 * Its bound (4*DIM) suggests it initialises a 4*DIM-element buffer such as
 * scratch - TODO confirm.
 */
172 for(iidx=0;iidx<4*DIM;iidx++)
177 /* Start outer loop over neighborlists */
178 for(iidx=0; iidx<nri; iidx++)
180 /* Load shift vector for this list */
181 i_shift_offset = DIM*shiftidx[iidx];
183 /* Load limits for loop over neighbors */
184 j_index_start = jindex[iidx];
185 j_index_end = jindex[iidx+1];
187 /* Get outer coordinate index */
/* NOTE(review): no per-iteration assignment of inr is visible before this use. */
189 i_coord_offset = DIM*inr;
191 /* Load i particle coords and add shift vector */
192 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
193 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Clear the force accumulators of the four i sites for this list entry. */
195 fix0 = _mm256_setzero_pd();
196 fiy0 = _mm256_setzero_pd();
197 fiz0 = _mm256_setzero_pd();
198 fix1 = _mm256_setzero_pd();
199 fiy1 = _mm256_setzero_pd();
200 fiz1 = _mm256_setzero_pd();
201 fix2 = _mm256_setzero_pd();
202 fiy2 = _mm256_setzero_pd();
203 fiz2 = _mm256_setzero_pd();
204 fix3 = _mm256_setzero_pd();
205 fiy3 = _mm256_setzero_pd();
206 fiz3 = _mm256_setzero_pd();
208 /* Reset potential sums */
209 velecsum = _mm256_setzero_pd();
210 vvdwsum = _mm256_setzero_pd();
212 /* Start inner kernel loop */
/* Unmasked part: consumes groups of four j entries for as long as the fourth
 * entry of the group is non-negative. A group containing negative (dummy)
 * entries is left to the masked code further down.
 */
213 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
216 /* Get j neighbor index, and coordinate index */
/* NOTE(review): the loads of jnrA..jnrD from the j list are not visible here. */
221 j_coord_offsetA = DIM*jnrA;
222 j_coord_offsetB = DIM*jnrB;
223 j_coord_offsetC = DIM*jnrC;
224 j_coord_offsetD = DIM*jnrD;
226 /* load j atom coordinates */
227 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
228 x+j_coord_offsetC,x+j_coord_offsetD,
229 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
230 &jy2,&jz2,&jx3,&jy3,&jz3);
232 /* Calculate displacement vector */
/* dXY = (i site X) - (j site Y); only the 0-0 pair and the nine pairs among
 * sites 1..3 are formed.
 */
233 dx00 = _mm256_sub_pd(ix0,jx0);
234 dy00 = _mm256_sub_pd(iy0,jy0);
235 dz00 = _mm256_sub_pd(iz0,jz0);
236 dx11 = _mm256_sub_pd(ix1,jx1);
237 dy11 = _mm256_sub_pd(iy1,jy1);
238 dz11 = _mm256_sub_pd(iz1,jz1);
239 dx12 = _mm256_sub_pd(ix1,jx2);
240 dy12 = _mm256_sub_pd(iy1,jy2);
241 dz12 = _mm256_sub_pd(iz1,jz2);
242 dx13 = _mm256_sub_pd(ix1,jx3);
243 dy13 = _mm256_sub_pd(iy1,jy3);
244 dz13 = _mm256_sub_pd(iz1,jz3);
245 dx21 = _mm256_sub_pd(ix2,jx1);
246 dy21 = _mm256_sub_pd(iy2,jy1);
247 dz21 = _mm256_sub_pd(iz2,jz1);
248 dx22 = _mm256_sub_pd(ix2,jx2);
249 dy22 = _mm256_sub_pd(iy2,jy2);
250 dz22 = _mm256_sub_pd(iz2,jz2);
251 dx23 = _mm256_sub_pd(ix2,jx3);
252 dy23 = _mm256_sub_pd(iy2,jy3);
253 dz23 = _mm256_sub_pd(iz2,jz3);
254 dx31 = _mm256_sub_pd(ix3,jx1);
255 dy31 = _mm256_sub_pd(iy3,jy1);
256 dz31 = _mm256_sub_pd(iz3,jz1);
257 dx32 = _mm256_sub_pd(ix3,jx2);
258 dy32 = _mm256_sub_pd(iy3,jy2);
259 dz32 = _mm256_sub_pd(iz3,jz2);
260 dx33 = _mm256_sub_pd(ix3,jx3);
261 dy33 = _mm256_sub_pd(iy3,jy3);
262 dz33 = _mm256_sub_pd(iz3,jz3);
264 /* Calculate squared distance and things based on it */
265 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
266 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
267 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
268 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
269 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
270 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
271 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
272 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
273 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
274 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
/* The Coulomb pairs get an inverse square root of rsq; the 0-0 pair has no
 * such call and only gets avx256_inv_d(rsq00) below.
 */
276 rinv11 = avx256_invsqrt_d(rsq11);
277 rinv12 = avx256_invsqrt_d(rsq12);
278 rinv13 = avx256_invsqrt_d(rsq13);
279 rinv21 = avx256_invsqrt_d(rsq21);
280 rinv22 = avx256_invsqrt_d(rsq22);
281 rinv23 = avx256_invsqrt_d(rsq23);
282 rinv31 = avx256_invsqrt_d(rsq31);
283 rinv32 = avx256_invsqrt_d(rsq32);
284 rinv33 = avx256_invsqrt_d(rsq33);
286 rinvsq00 = avx256_inv_d(rsq00);
287 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
288 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
289 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
290 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
291 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
292 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
293 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
294 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
295 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
/* Clear the j-site force accumulators for this group of four j waters. */
297 fjx0 = _mm256_setzero_pd();
298 fjy0 = _mm256_setzero_pd();
299 fjz0 = _mm256_setzero_pd();
300 fjx1 = _mm256_setzero_pd();
301 fjy1 = _mm256_setzero_pd();
302 fjz1 = _mm256_setzero_pd();
303 fjx2 = _mm256_setzero_pd();
304 fjy2 = _mm256_setzero_pd();
305 fjz2 = _mm256_setzero_pd();
306 fjx3 = _mm256_setzero_pd();
307 fjy3 = _mm256_setzero_pd();
308 fjz3 = _mm256_setzero_pd();
310 /**************************
311 * CALCULATE INTERACTIONS *
312 **************************/
314 /* LENNARD-JONES DISPERSION/REPULSION */
/* With rinvsix = rinvsq00^3:
 *   vvdw = c12_00*rinvsix^2/12 - c6_00*rinvsix/6
 *   fvdw = (c12_00*rinvsix^2 - c6_00*rinvsix)*rinvsq00
 * The 1/12 and 1/6 factors suggest that the table stores 12*C12 and 6*C6 -
 * TODO confirm against the code that fills the parameter table.
 */
316 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
317 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
318 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
319 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
320 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
322 /* Update potential sum for this i atom from the interaction with this j atom. */
323 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
/* NOTE(review): fscal is used below but no assignment to it is visible in
 * this text; presumably it is set from fvdw here and from felec in the
 * Coulomb sections - TODO confirm.
 */
327 /* Calculate temporary vectorial force */
328 tx = _mm256_mul_pd(fscal,dx00);
329 ty = _mm256_mul_pd(fscal,dy00);
330 tz = _mm256_mul_pd(fscal,dz00);
332 /* Update vectorial force */
/* The same (tx,ty,tz) is added to both the i and the j accumulator; the j
 * accumulators are later handed to a "decrement" helper.
 */
333 fix0 = _mm256_add_pd(fix0,tx);
334 fiy0 = _mm256_add_pd(fiy0,ty);
335 fiz0 = _mm256_add_pd(fiz0,tz);
337 fjx0 = _mm256_add_pd(fjx0,tx);
338 fjy0 = _mm256_add_pd(fjy0,ty);
339 fjz0 = _mm256_add_pd(fjz0,tz);
341 /**************************
342 * CALCULATE INTERACTIONS *
343 **************************/
345 /* COULOMB ELECTROSTATICS */
/* Pattern shared by all nine Coulomb pairs XY:
 *   velec = qqXY*rinvXY,  felec = velec*rinvsqXY
 * followed by the same force update as above, on i site X and j site Y.
 */
346 velec = _mm256_mul_pd(qq11,rinv11);
347 felec = _mm256_mul_pd(velec,rinvsq11);
349 /* Update potential sum for this i atom from the interaction with this j atom. */
350 velecsum = _mm256_add_pd(velecsum,velec);
354 /* Calculate temporary vectorial force */
355 tx = _mm256_mul_pd(fscal,dx11);
356 ty = _mm256_mul_pd(fscal,dy11);
357 tz = _mm256_mul_pd(fscal,dz11);
359 /* Update vectorial force */
360 fix1 = _mm256_add_pd(fix1,tx);
361 fiy1 = _mm256_add_pd(fiy1,ty);
362 fiz1 = _mm256_add_pd(fiz1,tz);
364 fjx1 = _mm256_add_pd(fjx1,tx);
365 fjy1 = _mm256_add_pd(fjy1,ty);
366 fjz1 = _mm256_add_pd(fjz1,tz);
368 /**************************
369 * CALCULATE INTERACTIONS *
370 **************************/
372 /* COULOMB ELECTROSTATICS */
373 velec = _mm256_mul_pd(qq12,rinv12);
374 felec = _mm256_mul_pd(velec,rinvsq12);
376 /* Update potential sum for this i atom from the interaction with this j atom. */
377 velecsum = _mm256_add_pd(velecsum,velec);
381 /* Calculate temporary vectorial force */
382 tx = _mm256_mul_pd(fscal,dx12);
383 ty = _mm256_mul_pd(fscal,dy12);
384 tz = _mm256_mul_pd(fscal,dz12);
386 /* Update vectorial force */
387 fix1 = _mm256_add_pd(fix1,tx);
388 fiy1 = _mm256_add_pd(fiy1,ty);
389 fiz1 = _mm256_add_pd(fiz1,tz);
391 fjx2 = _mm256_add_pd(fjx2,tx);
392 fjy2 = _mm256_add_pd(fjy2,ty);
393 fjz2 = _mm256_add_pd(fjz2,tz);
395 /**************************
396 * CALCULATE INTERACTIONS *
397 **************************/
399 /* COULOMB ELECTROSTATICS */
400 velec = _mm256_mul_pd(qq13,rinv13);
401 felec = _mm256_mul_pd(velec,rinvsq13);
403 /* Update potential sum for this i atom from the interaction with this j atom. */
404 velecsum = _mm256_add_pd(velecsum,velec);
408 /* Calculate temporary vectorial force */
409 tx = _mm256_mul_pd(fscal,dx13);
410 ty = _mm256_mul_pd(fscal,dy13);
411 tz = _mm256_mul_pd(fscal,dz13);
413 /* Update vectorial force */
414 fix1 = _mm256_add_pd(fix1,tx);
415 fiy1 = _mm256_add_pd(fiy1,ty);
416 fiz1 = _mm256_add_pd(fiz1,tz);
418 fjx3 = _mm256_add_pd(fjx3,tx);
419 fjy3 = _mm256_add_pd(fjy3,ty);
420 fjz3 = _mm256_add_pd(fjz3,tz);
422 /**************************
423 * CALCULATE INTERACTIONS *
424 **************************/
426 /* COULOMB ELECTROSTATICS */
427 velec = _mm256_mul_pd(qq21,rinv21);
428 felec = _mm256_mul_pd(velec,rinvsq21);
430 /* Update potential sum for this i atom from the interaction with this j atom. */
431 velecsum = _mm256_add_pd(velecsum,velec);
435 /* Calculate temporary vectorial force */
436 tx = _mm256_mul_pd(fscal,dx21);
437 ty = _mm256_mul_pd(fscal,dy21);
438 tz = _mm256_mul_pd(fscal,dz21);
440 /* Update vectorial force */
441 fix2 = _mm256_add_pd(fix2,tx);
442 fiy2 = _mm256_add_pd(fiy2,ty);
443 fiz2 = _mm256_add_pd(fiz2,tz);
445 fjx1 = _mm256_add_pd(fjx1,tx);
446 fjy1 = _mm256_add_pd(fjy1,ty);
447 fjz1 = _mm256_add_pd(fjz1,tz);
449 /**************************
450 * CALCULATE INTERACTIONS *
451 **************************/
453 /* COULOMB ELECTROSTATICS */
454 velec = _mm256_mul_pd(qq22,rinv22);
455 felec = _mm256_mul_pd(velec,rinvsq22);
457 /* Update potential sum for this i atom from the interaction with this j atom. */
458 velecsum = _mm256_add_pd(velecsum,velec);
462 /* Calculate temporary vectorial force */
463 tx = _mm256_mul_pd(fscal,dx22);
464 ty = _mm256_mul_pd(fscal,dy22);
465 tz = _mm256_mul_pd(fscal,dz22);
467 /* Update vectorial force */
468 fix2 = _mm256_add_pd(fix2,tx);
469 fiy2 = _mm256_add_pd(fiy2,ty);
470 fiz2 = _mm256_add_pd(fiz2,tz);
472 fjx2 = _mm256_add_pd(fjx2,tx);
473 fjy2 = _mm256_add_pd(fjy2,ty);
474 fjz2 = _mm256_add_pd(fjz2,tz);
476 /**************************
477 * CALCULATE INTERACTIONS *
478 **************************/
480 /* COULOMB ELECTROSTATICS */
481 velec = _mm256_mul_pd(qq23,rinv23);
482 felec = _mm256_mul_pd(velec,rinvsq23);
484 /* Update potential sum for this i atom from the interaction with this j atom. */
485 velecsum = _mm256_add_pd(velecsum,velec);
489 /* Calculate temporary vectorial force */
490 tx = _mm256_mul_pd(fscal,dx23);
491 ty = _mm256_mul_pd(fscal,dy23);
492 tz = _mm256_mul_pd(fscal,dz23);
494 /* Update vectorial force */
495 fix2 = _mm256_add_pd(fix2,tx);
496 fiy2 = _mm256_add_pd(fiy2,ty);
497 fiz2 = _mm256_add_pd(fiz2,tz);
499 fjx3 = _mm256_add_pd(fjx3,tx);
500 fjy3 = _mm256_add_pd(fjy3,ty);
501 fjz3 = _mm256_add_pd(fjz3,tz);
503 /**************************
504 * CALCULATE INTERACTIONS *
505 **************************/
507 /* COULOMB ELECTROSTATICS */
508 velec = _mm256_mul_pd(qq31,rinv31);
509 felec = _mm256_mul_pd(velec,rinvsq31);
511 /* Update potential sum for this i atom from the interaction with this j atom. */
512 velecsum = _mm256_add_pd(velecsum,velec);
516 /* Calculate temporary vectorial force */
517 tx = _mm256_mul_pd(fscal,dx31);
518 ty = _mm256_mul_pd(fscal,dy31);
519 tz = _mm256_mul_pd(fscal,dz31);
521 /* Update vectorial force */
522 fix3 = _mm256_add_pd(fix3,tx);
523 fiy3 = _mm256_add_pd(fiy3,ty);
524 fiz3 = _mm256_add_pd(fiz3,tz);
526 fjx1 = _mm256_add_pd(fjx1,tx);
527 fjy1 = _mm256_add_pd(fjy1,ty);
528 fjz1 = _mm256_add_pd(fjz1,tz);
530 /**************************
531 * CALCULATE INTERACTIONS *
532 **************************/
534 /* COULOMB ELECTROSTATICS */
535 velec = _mm256_mul_pd(qq32,rinv32);
536 felec = _mm256_mul_pd(velec,rinvsq32);
538 /* Update potential sum for this i atom from the interaction with this j atom. */
539 velecsum = _mm256_add_pd(velecsum,velec);
543 /* Calculate temporary vectorial force */
544 tx = _mm256_mul_pd(fscal,dx32);
545 ty = _mm256_mul_pd(fscal,dy32);
546 tz = _mm256_mul_pd(fscal,dz32);
548 /* Update vectorial force */
549 fix3 = _mm256_add_pd(fix3,tx);
550 fiy3 = _mm256_add_pd(fiy3,ty);
551 fiz3 = _mm256_add_pd(fiz3,tz);
553 fjx2 = _mm256_add_pd(fjx2,tx);
554 fjy2 = _mm256_add_pd(fjy2,ty);
555 fjz2 = _mm256_add_pd(fjz2,tz);
557 /**************************
558 * CALCULATE INTERACTIONS *
559 **************************/
561 /* COULOMB ELECTROSTATICS */
562 velec = _mm256_mul_pd(qq33,rinv33);
563 felec = _mm256_mul_pd(velec,rinvsq33);
565 /* Update potential sum for this i atom from the interaction with this j atom. */
566 velecsum = _mm256_add_pd(velecsum,velec);
570 /* Calculate temporary vectorial force */
571 tx = _mm256_mul_pd(fscal,dx33);
572 ty = _mm256_mul_pd(fscal,dy33);
573 tz = _mm256_mul_pd(fscal,dz33);
575 /* Update vectorial force */
576 fix3 = _mm256_add_pd(fix3,tx);
577 fiy3 = _mm256_add_pd(fiy3,ty);
578 fiz3 = _mm256_add_pd(fiz3,tz);
580 fjx3 = _mm256_add_pd(fjx3,tx);
581 fjy3 = _mm256_add_pd(fjy3,ty);
582 fjz3 = _mm256_add_pd(fjz3,tz);
/* Write the accumulated j forces back to f at the four j offsets. Judging by
 * the helper's name the values are subtracted from f - TODO confirm in
 * kernelutil_x86_avx_256_double.h.
 */
584 fjptrA = f+j_coord_offsetA;
585 fjptrB = f+j_coord_offsetB;
586 fjptrC = f+j_coord_offsetC;
587 fjptrD = f+j_coord_offsetD;
589 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
590 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
591 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
593 /* Inner loop uses 278 flops */
/* Masked pass over one group of four j entries that may contain negative
 * (dummy) indices. It repeats the computation above, but clears the dummy
 * lanes with dummy_mask and redirects their force output to scratch.
 * NOTE(review): whatever condition guards this pass is not visible in this
 * text.
 */
599 /* Get j neighbor index, and coordinate index */
600 jnrlistA = jjnr[jidx];
601 jnrlistB = jjnr[jidx+1];
602 jnrlistC = jjnr[jidx+2];
603 jnrlistD = jjnr[jidx+3];
604 /* Sign of each element will be negative for non-real atoms.
605 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
606 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
608 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
610 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
611 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
612 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
614 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
615 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
616 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
617 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
618 j_coord_offsetA = DIM*jnrA;
619 j_coord_offsetB = DIM*jnrB;
620 j_coord_offsetC = DIM*jnrC;
621 j_coord_offsetD = DIM*jnrD;
623 /* load j atom coordinates */
624 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
625 x+j_coord_offsetC,x+j_coord_offsetD,
626 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
627 &jy2,&jz2,&jx3,&jy3,&jz3);
629 /* Calculate displacement vector */
630 dx00 = _mm256_sub_pd(ix0,jx0);
631 dy00 = _mm256_sub_pd(iy0,jy0);
632 dz00 = _mm256_sub_pd(iz0,jz0);
633 dx11 = _mm256_sub_pd(ix1,jx1);
634 dy11 = _mm256_sub_pd(iy1,jy1);
635 dz11 = _mm256_sub_pd(iz1,jz1);
636 dx12 = _mm256_sub_pd(ix1,jx2);
637 dy12 = _mm256_sub_pd(iy1,jy2);
638 dz12 = _mm256_sub_pd(iz1,jz2);
639 dx13 = _mm256_sub_pd(ix1,jx3);
640 dy13 = _mm256_sub_pd(iy1,jy3);
641 dz13 = _mm256_sub_pd(iz1,jz3);
642 dx21 = _mm256_sub_pd(ix2,jx1);
643 dy21 = _mm256_sub_pd(iy2,jy1);
644 dz21 = _mm256_sub_pd(iz2,jz1);
645 dx22 = _mm256_sub_pd(ix2,jx2);
646 dy22 = _mm256_sub_pd(iy2,jy2);
647 dz22 = _mm256_sub_pd(iz2,jz2);
648 dx23 = _mm256_sub_pd(ix2,jx3);
649 dy23 = _mm256_sub_pd(iy2,jy3);
650 dz23 = _mm256_sub_pd(iz2,jz3);
651 dx31 = _mm256_sub_pd(ix3,jx1);
652 dy31 = _mm256_sub_pd(iy3,jy1);
653 dz31 = _mm256_sub_pd(iz3,jz1);
654 dx32 = _mm256_sub_pd(ix3,jx2);
655 dy32 = _mm256_sub_pd(iy3,jy2);
656 dz32 = _mm256_sub_pd(iz3,jz2);
657 dx33 = _mm256_sub_pd(ix3,jx3);
658 dy33 = _mm256_sub_pd(iy3,jy3);
659 dz33 = _mm256_sub_pd(iz3,jz3);
661 /* Calculate squared distance and things based on it */
662 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
663 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
664 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
665 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
666 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
667 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
668 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
669 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
670 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
671 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
673 rinv11 = avx256_invsqrt_d(rsq11);
674 rinv12 = avx256_invsqrt_d(rsq12);
675 rinv13 = avx256_invsqrt_d(rsq13);
676 rinv21 = avx256_invsqrt_d(rsq21);
677 rinv22 = avx256_invsqrt_d(rsq22);
678 rinv23 = avx256_invsqrt_d(rsq23);
679 rinv31 = avx256_invsqrt_d(rsq31);
680 rinv32 = avx256_invsqrt_d(rsq32);
681 rinv33 = avx256_invsqrt_d(rsq33);
683 rinvsq00 = avx256_inv_d(rsq00);
684 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
685 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
686 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
687 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
688 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
689 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
690 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
691 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
692 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
694 fjx0 = _mm256_setzero_pd();
695 fjy0 = _mm256_setzero_pd();
696 fjz0 = _mm256_setzero_pd();
697 fjx1 = _mm256_setzero_pd();
698 fjy1 = _mm256_setzero_pd();
699 fjz1 = _mm256_setzero_pd();
700 fjx2 = _mm256_setzero_pd();
701 fjy2 = _mm256_setzero_pd();
702 fjz2 = _mm256_setzero_pd();
703 fjx3 = _mm256_setzero_pd();
704 fjy3 = _mm256_setzero_pd();
705 fjz3 = _mm256_setzero_pd();
707 /**************************
708 * CALCULATE INTERACTIONS *
709 **************************/
711 /* LENNARD-JONES DISPERSION/REPULSION */
713 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
714 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
715 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
716 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
717 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
719 /* Update potential sum for this i atom from the interaction with this j atom. */
/* Clear the lanes of dummy entries so they add nothing to the energy sum. */
720 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
721 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
/* Same masking for the scalar force, so dummy lanes produce zero force. */
725 fscal = _mm256_andnot_pd(dummy_mask,fscal);
727 /* Calculate temporary vectorial force */
728 tx = _mm256_mul_pd(fscal,dx00);
729 ty = _mm256_mul_pd(fscal,dy00);
730 tz = _mm256_mul_pd(fscal,dz00);
732 /* Update vectorial force */
733 fix0 = _mm256_add_pd(fix0,tx);
734 fiy0 = _mm256_add_pd(fiy0,ty);
735 fiz0 = _mm256_add_pd(fiz0,tz);
737 fjx0 = _mm256_add_pd(fjx0,tx);
738 fjy0 = _mm256_add_pd(fjy0,ty);
739 fjz0 = _mm256_add_pd(fjz0,tz);
741 /**************************
742 * CALCULATE INTERACTIONS *
743 **************************/
745 /* COULOMB ELECTROSTATICS */
746 velec = _mm256_mul_pd(qq11,rinv11);
747 felec = _mm256_mul_pd(velec,rinvsq11);
749 /* Update potential sum for this i atom from the interaction with this j atom. */
750 velec = _mm256_andnot_pd(dummy_mask,velec);
751 velecsum = _mm256_add_pd(velecsum,velec);
755 fscal = _mm256_andnot_pd(dummy_mask,fscal);
757 /* Calculate temporary vectorial force */
758 tx = _mm256_mul_pd(fscal,dx11);
759 ty = _mm256_mul_pd(fscal,dy11);
760 tz = _mm256_mul_pd(fscal,dz11);
762 /* Update vectorial force */
763 fix1 = _mm256_add_pd(fix1,tx);
764 fiy1 = _mm256_add_pd(fiy1,ty);
765 fiz1 = _mm256_add_pd(fiz1,tz);
767 fjx1 = _mm256_add_pd(fjx1,tx);
768 fjy1 = _mm256_add_pd(fjy1,ty);
769 fjz1 = _mm256_add_pd(fjz1,tz);
771 /**************************
772 * CALCULATE INTERACTIONS *
773 **************************/
775 /* COULOMB ELECTROSTATICS */
776 velec = _mm256_mul_pd(qq12,rinv12);
777 felec = _mm256_mul_pd(velec,rinvsq12);
779 /* Update potential sum for this i atom from the interaction with this j atom. */
780 velec = _mm256_andnot_pd(dummy_mask,velec);
781 velecsum = _mm256_add_pd(velecsum,velec);
785 fscal = _mm256_andnot_pd(dummy_mask,fscal);
787 /* Calculate temporary vectorial force */
788 tx = _mm256_mul_pd(fscal,dx12);
789 ty = _mm256_mul_pd(fscal,dy12);
790 tz = _mm256_mul_pd(fscal,dz12);
792 /* Update vectorial force */
793 fix1 = _mm256_add_pd(fix1,tx);
794 fiy1 = _mm256_add_pd(fiy1,ty);
795 fiz1 = _mm256_add_pd(fiz1,tz);
797 fjx2 = _mm256_add_pd(fjx2,tx);
798 fjy2 = _mm256_add_pd(fjy2,ty);
799 fjz2 = _mm256_add_pd(fjz2,tz);
801 /**************************
802 * CALCULATE INTERACTIONS *
803 **************************/
805 /* COULOMB ELECTROSTATICS */
806 velec = _mm256_mul_pd(qq13,rinv13);
807 felec = _mm256_mul_pd(velec,rinvsq13);
809 /* Update potential sum for this i atom from the interaction with this j atom. */
810 velec = _mm256_andnot_pd(dummy_mask,velec);
811 velecsum = _mm256_add_pd(velecsum,velec);
815 fscal = _mm256_andnot_pd(dummy_mask,fscal);
817 /* Calculate temporary vectorial force */
818 tx = _mm256_mul_pd(fscal,dx13);
819 ty = _mm256_mul_pd(fscal,dy13);
820 tz = _mm256_mul_pd(fscal,dz13);
822 /* Update vectorial force */
823 fix1 = _mm256_add_pd(fix1,tx);
824 fiy1 = _mm256_add_pd(fiy1,ty);
825 fiz1 = _mm256_add_pd(fiz1,tz);
827 fjx3 = _mm256_add_pd(fjx3,tx);
828 fjy3 = _mm256_add_pd(fjy3,ty);
829 fjz3 = _mm256_add_pd(fjz3,tz);
831 /**************************
832 * CALCULATE INTERACTIONS *
833 **************************/
835 /* COULOMB ELECTROSTATICS */
836 velec = _mm256_mul_pd(qq21,rinv21);
837 felec = _mm256_mul_pd(velec,rinvsq21);
839 /* Update potential sum for this i atom from the interaction with this j atom. */
840 velec = _mm256_andnot_pd(dummy_mask,velec);
841 velecsum = _mm256_add_pd(velecsum,velec);
845 fscal = _mm256_andnot_pd(dummy_mask,fscal);
847 /* Calculate temporary vectorial force */
848 tx = _mm256_mul_pd(fscal,dx21);
849 ty = _mm256_mul_pd(fscal,dy21);
850 tz = _mm256_mul_pd(fscal,dz21);
852 /* Update vectorial force */
853 fix2 = _mm256_add_pd(fix2,tx);
854 fiy2 = _mm256_add_pd(fiy2,ty);
855 fiz2 = _mm256_add_pd(fiz2,tz);
857 fjx1 = _mm256_add_pd(fjx1,tx);
858 fjy1 = _mm256_add_pd(fjy1,ty);
859 fjz1 = _mm256_add_pd(fjz1,tz);
861 /**************************
862 * CALCULATE INTERACTIONS *
863 **************************/
865 /* COULOMB ELECTROSTATICS */
866 velec = _mm256_mul_pd(qq22,rinv22);
867 felec = _mm256_mul_pd(velec,rinvsq22);
869 /* Update potential sum for this i atom from the interaction with this j atom. */
870 velec = _mm256_andnot_pd(dummy_mask,velec);
871 velecsum = _mm256_add_pd(velecsum,velec);
875 fscal = _mm256_andnot_pd(dummy_mask,fscal);
877 /* Calculate temporary vectorial force */
878 tx = _mm256_mul_pd(fscal,dx22);
879 ty = _mm256_mul_pd(fscal,dy22);
880 tz = _mm256_mul_pd(fscal,dz22);
882 /* Update vectorial force */
883 fix2 = _mm256_add_pd(fix2,tx);
884 fiy2 = _mm256_add_pd(fiy2,ty);
885 fiz2 = _mm256_add_pd(fiz2,tz);
887 fjx2 = _mm256_add_pd(fjx2,tx);
888 fjy2 = _mm256_add_pd(fjy2,ty);
889 fjz2 = _mm256_add_pd(fjz2,tz);
891 /**************************
892 * CALCULATE INTERACTIONS *
893 **************************/
895 /* COULOMB ELECTROSTATICS */
896 velec = _mm256_mul_pd(qq23,rinv23);
897 felec = _mm256_mul_pd(velec,rinvsq23);
899 /* Update potential sum for this i atom from the interaction with this j atom. */
900 velec = _mm256_andnot_pd(dummy_mask,velec);
901 velecsum = _mm256_add_pd(velecsum,velec);
905 fscal = _mm256_andnot_pd(dummy_mask,fscal);
907 /* Calculate temporary vectorial force */
908 tx = _mm256_mul_pd(fscal,dx23);
909 ty = _mm256_mul_pd(fscal,dy23);
910 tz = _mm256_mul_pd(fscal,dz23);
912 /* Update vectorial force */
913 fix2 = _mm256_add_pd(fix2,tx);
914 fiy2 = _mm256_add_pd(fiy2,ty);
915 fiz2 = _mm256_add_pd(fiz2,tz);
917 fjx3 = _mm256_add_pd(fjx3,tx);
918 fjy3 = _mm256_add_pd(fjy3,ty);
919 fjz3 = _mm256_add_pd(fjz3,tz);
921 /**************************
922 * CALCULATE INTERACTIONS *
923 **************************/
925 /* COULOMB ELECTROSTATICS */
926 velec = _mm256_mul_pd(qq31,rinv31);
927 felec = _mm256_mul_pd(velec,rinvsq31);
929 /* Update potential sum for this i atom from the interaction with this j atom. */
930 velec = _mm256_andnot_pd(dummy_mask,velec);
931 velecsum = _mm256_add_pd(velecsum,velec);
935 fscal = _mm256_andnot_pd(dummy_mask,fscal);
937 /* Calculate temporary vectorial force */
938 tx = _mm256_mul_pd(fscal,dx31);
939 ty = _mm256_mul_pd(fscal,dy31);
940 tz = _mm256_mul_pd(fscal,dz31);
942 /* Update vectorial force */
943 fix3 = _mm256_add_pd(fix3,tx);
944 fiy3 = _mm256_add_pd(fiy3,ty);
945 fiz3 = _mm256_add_pd(fiz3,tz);
947 fjx1 = _mm256_add_pd(fjx1,tx);
948 fjy1 = _mm256_add_pd(fjy1,ty);
949 fjz1 = _mm256_add_pd(fjz1,tz);
951 /**************************
952 * CALCULATE INTERACTIONS *
953 **************************/
955 /* COULOMB ELECTROSTATICS */
956 velec = _mm256_mul_pd(qq32,rinv32);
957 felec = _mm256_mul_pd(velec,rinvsq32);
959 /* Update potential sum for this i atom from the interaction with this j atom. */
960 velec = _mm256_andnot_pd(dummy_mask,velec);
961 velecsum = _mm256_add_pd(velecsum,velec);
965 fscal = _mm256_andnot_pd(dummy_mask,fscal);
967 /* Calculate temporary vectorial force */
968 tx = _mm256_mul_pd(fscal,dx32);
969 ty = _mm256_mul_pd(fscal,dy32);
970 tz = _mm256_mul_pd(fscal,dz32);
972 /* Update vectorial force */
973 fix3 = _mm256_add_pd(fix3,tx);
974 fiy3 = _mm256_add_pd(fiy3,ty);
975 fiz3 = _mm256_add_pd(fiz3,tz);
977 fjx2 = _mm256_add_pd(fjx2,tx);
978 fjy2 = _mm256_add_pd(fjy2,ty);
979 fjz2 = _mm256_add_pd(fjz2,tz);
981 /**************************
982 * CALCULATE INTERACTIONS *
983 **************************/
985 /* COULOMB ELECTROSTATICS */
986 velec = _mm256_mul_pd(qq33,rinv33);
987 felec = _mm256_mul_pd(velec,rinvsq33);
989 /* Update potential sum for this i atom from the interaction with this j atom. */
990 velec = _mm256_andnot_pd(dummy_mask,velec);
991 velecsum = _mm256_add_pd(velecsum,velec);
995 fscal = _mm256_andnot_pd(dummy_mask,fscal);
997 /* Calculate temporary vectorial force */
998 tx = _mm256_mul_pd(fscal,dx33);
999 ty = _mm256_mul_pd(fscal,dy33);
1000 tz = _mm256_mul_pd(fscal,dz33);
1002 /* Update vectorial force */
1003 fix3 = _mm256_add_pd(fix3,tx);
1004 fiy3 = _mm256_add_pd(fiy3,ty);
1005 fiz3 = _mm256_add_pd(fiz3,tz);
1007 fjx3 = _mm256_add_pd(fjx3,tx);
1008 fjy3 = _mm256_add_pd(fjy3,ty);
1009 fjz3 = _mm256_add_pd(fjz3,tz);
/* Lanes whose list entry is negative write their (already zeroed) force
 * into scratch instead of f; their index was clamped to 0 above only so
 * that the coordinate load stays in bounds.
 */
1011 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1012 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1013 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1014 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1016 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1017 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1018 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1020 /* Inner loop uses 278 flops */
1023 /* End of innermost loop */
/* Hand the twelve i-force accumulators to the helper together with the i
 * water's slot in f and this list's slot in fshift. Presumably it reduces
 * the SIMD lanes and adds to both - TODO confirm in the kernelutil header.
 */
1025 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1026 f+i_coord_offset,fshift+i_shift_offset);
1029 /* Update potential energies */
/* NOTE(review): ggid is used as the energy-group index but its assignment is
 * not visible in this text.
 */
1030 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1031 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1033 /* Increment number of inner iterations */
1034 inneriter += j_index_end - j_index_start;
1036 /* Outer loop uses 26 flops */
1039 /* Increment number of outer iterations */
1042 /* Update outer/inner flops */
/* Flop accounting: 26 per outer iteration plus 278 per inner list entry. */
1044 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*278);
1047 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_double
1048 * Electrostatics interaction: Coulomb
1049 * VdW interaction: LennardJones
1050 * Geometry: Water4-Water4
1051 * Calculate force/pot: Force
/*
 * Force-only kernel for water-water pairs with four sites per molecule
 * (suffixes 0..3). Site pair 0-0 gets Lennard-Jones only, and the nine
 * pairs among sites 1..3 get plain Coulomb (felec = qq*rinv*rinvsq).
 * No potential energy is accumulated in this variant.
 *
 * nlist       - neighbour list; jindex, shift and iinr[0] are read from it.
 * xx, ff      - coordinate and force arrays; the body works on x and f
 *               (NOTE(review): the x/f assignments are not visible in the
 *               text reviewed here - confirm against the full file).
 * fr          - supplies shift_vec, fshift, ic->epsfac, ntype and nbfp.
 * mdatoms     - supplies chargeA and typeA.
 * kernel_data - marked gmx_unused; not referenced in the visible body.
 * nrnb        - receives the flop count through inc_nrnb() at the end.
 *
 * Charges and LJ parameters are looked up once, from the first i water
 * (nlist->iinr[0]), and the same values are reused for every i and j
 * molecule: iq1..iq3 are pre-multiplied by epsfac, qqXY = iqX*jqY, and
 * c6_00/c12_00 come from the nbfp row of site 0.
 */
1054 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_double
1055 (t_nblist * gmx_restrict nlist,
1056 rvec * gmx_restrict xx,
1057 rvec * gmx_restrict ff,
1058 struct t_forcerec * gmx_restrict fr,
1059 t_mdatoms * gmx_restrict mdatoms,
1060 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
1061 t_nrnb * gmx_restrict nrnb)
1063 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1064 * just 0 for non-waters.
1065 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1066 * jnr indices corresponding to data put in the four positions in the SIMD register.
1068 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1069 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1070 int jnrA,jnrB,jnrC,jnrD;
1071 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1072 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1073 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1074 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1075 real rcutoff_scalar;
1076 real *shiftvec,*fshift,*x,*f;
1077 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1078 real scratch[4*DIM];
1079 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1080 real * vdwioffsetptr0;
1081 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1082 real * vdwioffsetptr1;
1083 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1084 real * vdwioffsetptr2;
1085 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1086 real * vdwioffsetptr3;
1087 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1088 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1089 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1090 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1091 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1092 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1093 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1094 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1095 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1096 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1097 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1098 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1099 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1100 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1101 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1102 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1103 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1104 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1105 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1106 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1109 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1112 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1113 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1114 __m256d dummy_mask,cutoff_mask;
1115 __m128 tmpmask0,tmpmask1;
1116 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1117 __m256d one = _mm256_set1_pd(1.0);
1118 __m256d two = _mm256_set1_pd(2.0);
1124 jindex = nlist->jindex;
1126 shiftidx = nlist->shift;
1128 shiftvec = fr->shift_vec[0];
1129 fshift = fr->fshift[0];
1130 facel = _mm256_set1_pd(fr->ic->epsfac);
1131 charge = mdatoms->chargeA;
1132 nvdwtype = fr->ntype;
1133 vdwparam = fr->nbfp;
1134 vdwtype = mdatoms->typeA;
1136 /* Setup water-specific parameters */
1137 inr = nlist->iinr[0];
1138 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1139 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1140 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1141 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1143 jq1 = _mm256_set1_pd(charge[inr+1]);
1144 jq2 = _mm256_set1_pd(charge[inr+2]);
1145 jq3 = _mm256_set1_pd(charge[inr+3]);
1146 vdwjidx0A = 2*vdwtype[inr+0];
1147 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1148 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1149 qq11 = _mm256_mul_pd(iq1,jq1);
1150 qq12 = _mm256_mul_pd(iq1,jq2);
1151 qq13 = _mm256_mul_pd(iq1,jq3);
1152 qq21 = _mm256_mul_pd(iq2,jq1);
1153 qq22 = _mm256_mul_pd(iq2,jq2);
1154 qq23 = _mm256_mul_pd(iq2,jq3);
1155 qq31 = _mm256_mul_pd(iq3,jq1);
1156 qq32 = _mm256_mul_pd(iq3,jq2);
1157 qq33 = _mm256_mul_pd(iq3,jq3);
1159 /* Avoid stupid compiler warnings */
1160 jnrA = jnrB = jnrC = jnrD = 0;
1161 j_coord_offsetA = 0;
1162 j_coord_offsetB = 0;
1163 j_coord_offsetC = 0;
1164 j_coord_offsetD = 0;
/* Zero the scratch buffer; it later serves as the force destination for
 * j entries whose list index is negative (see the fjptr selection in the
 * masked remainder section).
 */
1169 for(iidx=0;iidx<4*DIM;iidx++)
1171 scratch[iidx] = 0.0;
1174 /* Start outer loop over neighborlists */
1175 for(iidx=0; iidx<nri; iidx++)
1177 /* Load shift vector for this list */
1178 i_shift_offset = DIM*shiftidx[iidx];
1180 /* Load limits for loop over neighbors */
1181 j_index_start = jindex[iidx];
1182 j_index_end = jindex[iidx+1];
1184 /* Get outer coordinate index */
1186 i_coord_offset = DIM*inr;
1188 /* Load i particle coords and add shift vector */
1189 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1190 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1192 fix0 = _mm256_setzero_pd();
1193 fiy0 = _mm256_setzero_pd();
1194 fiz0 = _mm256_setzero_pd();
1195 fix1 = _mm256_setzero_pd();
1196 fiy1 = _mm256_setzero_pd();
1197 fiz1 = _mm256_setzero_pd();
1198 fix2 = _mm256_setzero_pd();
1199 fiy2 = _mm256_setzero_pd();
1200 fiz2 = _mm256_setzero_pd();
1201 fix3 = _mm256_setzero_pd();
1202 fiy3 = _mm256_setzero_pd();
1203 fiz3 = _mm256_setzero_pd();
1205 /* Start inner kernel loop */
/* Unmasked iterations, four j waters at a time: the loop continues only
 * while the fourth entry of the group (jjnr[jidx+3]) is non-negative.
 * A final group containing negative entries is left to the masked
 * section that follows the loop.
 */
1206 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1209 /* Get j neighbor index, and coordinate index */
1211 jnrB = jjnr[jidx+1];
1212 jnrC = jjnr[jidx+2];
1213 jnrD = jjnr[jidx+3];
1214 j_coord_offsetA = DIM*jnrA;
1215 j_coord_offsetB = DIM*jnrB;
1216 j_coord_offsetC = DIM*jnrC;
1217 j_coord_offsetD = DIM*jnrD;
1219 /* load j atom coordinates */
1220 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1221 x+j_coord_offsetC,x+j_coord_offsetD,
1222 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1223 &jy2,&jz2,&jx3,&jy3,&jz3);
1225 /* Calculate displacement vector */
1226 dx00 = _mm256_sub_pd(ix0,jx0);
1227 dy00 = _mm256_sub_pd(iy0,jy0);
1228 dz00 = _mm256_sub_pd(iz0,jz0);
1229 dx11 = _mm256_sub_pd(ix1,jx1);
1230 dy11 = _mm256_sub_pd(iy1,jy1);
1231 dz11 = _mm256_sub_pd(iz1,jz1);
1232 dx12 = _mm256_sub_pd(ix1,jx2);
1233 dy12 = _mm256_sub_pd(iy1,jy2);
1234 dz12 = _mm256_sub_pd(iz1,jz2);
1235 dx13 = _mm256_sub_pd(ix1,jx3);
1236 dy13 = _mm256_sub_pd(iy1,jy3);
1237 dz13 = _mm256_sub_pd(iz1,jz3);
1238 dx21 = _mm256_sub_pd(ix2,jx1);
1239 dy21 = _mm256_sub_pd(iy2,jy1);
1240 dz21 = _mm256_sub_pd(iz2,jz1);
1241 dx22 = _mm256_sub_pd(ix2,jx2);
1242 dy22 = _mm256_sub_pd(iy2,jy2);
1243 dz22 = _mm256_sub_pd(iz2,jz2);
1244 dx23 = _mm256_sub_pd(ix2,jx3);
1245 dy23 = _mm256_sub_pd(iy2,jy3);
1246 dz23 = _mm256_sub_pd(iz2,jz3);
1247 dx31 = _mm256_sub_pd(ix3,jx1);
1248 dy31 = _mm256_sub_pd(iy3,jy1);
1249 dz31 = _mm256_sub_pd(iz3,jz1);
1250 dx32 = _mm256_sub_pd(ix3,jx2);
1251 dy32 = _mm256_sub_pd(iy3,jy2);
1252 dz32 = _mm256_sub_pd(iz3,jz2);
1253 dx33 = _mm256_sub_pd(ix3,jx3);
1254 dy33 = _mm256_sub_pd(iy3,jy3);
1255 dz33 = _mm256_sub_pd(iz3,jz3);
1257 /* Calculate squared distance and things based on it */
1258 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1259 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1260 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1261 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1262 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1263 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1264 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1265 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1266 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1267 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
/* rinv is computed only for the nine Coulomb pairs; the LJ-only 0-0 pair
 * needs just rinvsq00, which is taken from rsq00 directly (helper names
 * suggest 1/sqrt(x) and 1/x respectively).
 */
1269 rinv11 = avx256_invsqrt_d(rsq11);
1270 rinv12 = avx256_invsqrt_d(rsq12);
1271 rinv13 = avx256_invsqrt_d(rsq13);
1272 rinv21 = avx256_invsqrt_d(rsq21);
1273 rinv22 = avx256_invsqrt_d(rsq22);
1274 rinv23 = avx256_invsqrt_d(rsq23);
1275 rinv31 = avx256_invsqrt_d(rsq31);
1276 rinv32 = avx256_invsqrt_d(rsq32);
1277 rinv33 = avx256_invsqrt_d(rsq33);
1279 rinvsq00 = avx256_inv_d(rsq00);
1280 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1281 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1282 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1283 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1284 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1285 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1286 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1287 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1288 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
/* The j-site force accumulators restart from zero in every iteration and
 * are handed to gmx_mm256_decrement_4rvec_4ptr_swizzle_pd at its end.
 */
1290 fjx0 = _mm256_setzero_pd();
1291 fjy0 = _mm256_setzero_pd();
1292 fjz0 = _mm256_setzero_pd();
1293 fjx1 = _mm256_setzero_pd();
1294 fjy1 = _mm256_setzero_pd();
1295 fjz1 = _mm256_setzero_pd();
1296 fjx2 = _mm256_setzero_pd();
1297 fjy2 = _mm256_setzero_pd();
1298 fjz2 = _mm256_setzero_pd();
1299 fjx3 = _mm256_setzero_pd();
1300 fjy3 = _mm256_setzero_pd();
1301 fjz3 = _mm256_setzero_pd();
1303 /**************************
1304 * CALCULATE INTERACTIONS *
1305 **************************/
1307 /* LENNARD-JONES DISPERSION/REPULSION */
/* rinvsix = rinvsq00^3; fvdw = (c12_00*rinvsix - c6_00)*rinvsix*rinvsq00 */
1309 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1310 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
/* NOTE(review): tx/ty/tz are built from fscal throughout this function, but
 * the assignment to fscal is not visible in the text reviewed here
 * (presumably fscal = fvdw for this pair and fscal = felec in the Coulomb
 * sections) - confirm against the full file.
 */
1314 /* Calculate temporary vectorial force */
1315 tx = _mm256_mul_pd(fscal,dx00);
1316 ty = _mm256_mul_pd(fscal,dy00);
1317 tz = _mm256_mul_pd(fscal,dz00);
1319 /* Update vectorial force */
1320 fix0 = _mm256_add_pd(fix0,tx);
1321 fiy0 = _mm256_add_pd(fiy0,ty);
1322 fiz0 = _mm256_add_pd(fiz0,tz);
1324 fjx0 = _mm256_add_pd(fjx0,tx);
1325 fjy0 = _mm256_add_pd(fjy0,ty);
1326 fjz0 = _mm256_add_pd(fjz0,tz);
1328 /**************************
1329 * CALCULATE INTERACTIONS *
1330 **************************/
1332 /* COULOMB ELECTROSTATICS */
/* velec is only an intermediate here (felec = velec*rinvsq); this kernel
 * never adds it to an energy sum. The same pattern repeats for all nine
 * charged site pairs below.
 */
1333 velec = _mm256_mul_pd(qq11,rinv11);
1334 felec = _mm256_mul_pd(velec,rinvsq11);
1338 /* Calculate temporary vectorial force */
1339 tx = _mm256_mul_pd(fscal,dx11);
1340 ty = _mm256_mul_pd(fscal,dy11);
1341 tz = _mm256_mul_pd(fscal,dz11);
1343 /* Update vectorial force */
1344 fix1 = _mm256_add_pd(fix1,tx);
1345 fiy1 = _mm256_add_pd(fiy1,ty);
1346 fiz1 = _mm256_add_pd(fiz1,tz);
1348 fjx1 = _mm256_add_pd(fjx1,tx);
1349 fjy1 = _mm256_add_pd(fjy1,ty);
1350 fjz1 = _mm256_add_pd(fjz1,tz);
1352 /**************************
1353 * CALCULATE INTERACTIONS *
1354 **************************/
1356 /* COULOMB ELECTROSTATICS */
1357 velec = _mm256_mul_pd(qq12,rinv12);
1358 felec = _mm256_mul_pd(velec,rinvsq12);
1362 /* Calculate temporary vectorial force */
1363 tx = _mm256_mul_pd(fscal,dx12);
1364 ty = _mm256_mul_pd(fscal,dy12);
1365 tz = _mm256_mul_pd(fscal,dz12);
1367 /* Update vectorial force */
1368 fix1 = _mm256_add_pd(fix1,tx);
1369 fiy1 = _mm256_add_pd(fiy1,ty);
1370 fiz1 = _mm256_add_pd(fiz1,tz);
1372 fjx2 = _mm256_add_pd(fjx2,tx);
1373 fjy2 = _mm256_add_pd(fjy2,ty);
1374 fjz2 = _mm256_add_pd(fjz2,tz);
1376 /**************************
1377 * CALCULATE INTERACTIONS *
1378 **************************/
1380 /* COULOMB ELECTROSTATICS */
1381 velec = _mm256_mul_pd(qq13,rinv13);
1382 felec = _mm256_mul_pd(velec,rinvsq13);
1386 /* Calculate temporary vectorial force */
1387 tx = _mm256_mul_pd(fscal,dx13);
1388 ty = _mm256_mul_pd(fscal,dy13);
1389 tz = _mm256_mul_pd(fscal,dz13);
1391 /* Update vectorial force */
1392 fix1 = _mm256_add_pd(fix1,tx);
1393 fiy1 = _mm256_add_pd(fiy1,ty);
1394 fiz1 = _mm256_add_pd(fiz1,tz);
1396 fjx3 = _mm256_add_pd(fjx3,tx);
1397 fjy3 = _mm256_add_pd(fjy3,ty);
1398 fjz3 = _mm256_add_pd(fjz3,tz);
1400 /**************************
1401 * CALCULATE INTERACTIONS *
1402 **************************/
1404 /* COULOMB ELECTROSTATICS */
1405 velec = _mm256_mul_pd(qq21,rinv21);
1406 felec = _mm256_mul_pd(velec,rinvsq21);
1410 /* Calculate temporary vectorial force */
1411 tx = _mm256_mul_pd(fscal,dx21);
1412 ty = _mm256_mul_pd(fscal,dy21);
1413 tz = _mm256_mul_pd(fscal,dz21);
1415 /* Update vectorial force */
1416 fix2 = _mm256_add_pd(fix2,tx);
1417 fiy2 = _mm256_add_pd(fiy2,ty);
1418 fiz2 = _mm256_add_pd(fiz2,tz);
1420 fjx1 = _mm256_add_pd(fjx1,tx);
1421 fjy1 = _mm256_add_pd(fjy1,ty);
1422 fjz1 = _mm256_add_pd(fjz1,tz);
1424 /**************************
1425 * CALCULATE INTERACTIONS *
1426 **************************/
1428 /* COULOMB ELECTROSTATICS */
1429 velec = _mm256_mul_pd(qq22,rinv22);
1430 felec = _mm256_mul_pd(velec,rinvsq22);
1434 /* Calculate temporary vectorial force */
1435 tx = _mm256_mul_pd(fscal,dx22);
1436 ty = _mm256_mul_pd(fscal,dy22);
1437 tz = _mm256_mul_pd(fscal,dz22);
1439 /* Update vectorial force */
1440 fix2 = _mm256_add_pd(fix2,tx);
1441 fiy2 = _mm256_add_pd(fiy2,ty);
1442 fiz2 = _mm256_add_pd(fiz2,tz);
1444 fjx2 = _mm256_add_pd(fjx2,tx);
1445 fjy2 = _mm256_add_pd(fjy2,ty);
1446 fjz2 = _mm256_add_pd(fjz2,tz);
1448 /**************************
1449 * CALCULATE INTERACTIONS *
1450 **************************/
1452 /* COULOMB ELECTROSTATICS */
1453 velec = _mm256_mul_pd(qq23,rinv23);
1454 felec = _mm256_mul_pd(velec,rinvsq23);
1458 /* Calculate temporary vectorial force */
1459 tx = _mm256_mul_pd(fscal,dx23);
1460 ty = _mm256_mul_pd(fscal,dy23);
1461 tz = _mm256_mul_pd(fscal,dz23);
1463 /* Update vectorial force */
1464 fix2 = _mm256_add_pd(fix2,tx);
1465 fiy2 = _mm256_add_pd(fiy2,ty);
1466 fiz2 = _mm256_add_pd(fiz2,tz);
1468 fjx3 = _mm256_add_pd(fjx3,tx);
1469 fjy3 = _mm256_add_pd(fjy3,ty);
1470 fjz3 = _mm256_add_pd(fjz3,tz);
1472 /**************************
1473 * CALCULATE INTERACTIONS *
1474 **************************/
1476 /* COULOMB ELECTROSTATICS */
1477 velec = _mm256_mul_pd(qq31,rinv31);
1478 felec = _mm256_mul_pd(velec,rinvsq31);
1482 /* Calculate temporary vectorial force */
1483 tx = _mm256_mul_pd(fscal,dx31);
1484 ty = _mm256_mul_pd(fscal,dy31);
1485 tz = _mm256_mul_pd(fscal,dz31);
1487 /* Update vectorial force */
1488 fix3 = _mm256_add_pd(fix3,tx);
1489 fiy3 = _mm256_add_pd(fiy3,ty);
1490 fiz3 = _mm256_add_pd(fiz3,tz);
1492 fjx1 = _mm256_add_pd(fjx1,tx);
1493 fjy1 = _mm256_add_pd(fjy1,ty);
1494 fjz1 = _mm256_add_pd(fjz1,tz);
1496 /**************************
1497 * CALCULATE INTERACTIONS *
1498 **************************/
1500 /* COULOMB ELECTROSTATICS */
1501 velec = _mm256_mul_pd(qq32,rinv32);
1502 felec = _mm256_mul_pd(velec,rinvsq32);
1506 /* Calculate temporary vectorial force */
1507 tx = _mm256_mul_pd(fscal,dx32);
1508 ty = _mm256_mul_pd(fscal,dy32);
1509 tz = _mm256_mul_pd(fscal,dz32);
1511 /* Update vectorial force */
1512 fix3 = _mm256_add_pd(fix3,tx);
1513 fiy3 = _mm256_add_pd(fiy3,ty);
1514 fiz3 = _mm256_add_pd(fiz3,tz);
1516 fjx2 = _mm256_add_pd(fjx2,tx);
1517 fjy2 = _mm256_add_pd(fjy2,ty);
1518 fjz2 = _mm256_add_pd(fjz2,tz);
1520 /**************************
1521 * CALCULATE INTERACTIONS *
1522 **************************/
1524 /* COULOMB ELECTROSTATICS */
1525 velec = _mm256_mul_pd(qq33,rinv33);
1526 felec = _mm256_mul_pd(velec,rinvsq33);
1530 /* Calculate temporary vectorial force */
1531 tx = _mm256_mul_pd(fscal,dx33);
1532 ty = _mm256_mul_pd(fscal,dy33);
1533 tz = _mm256_mul_pd(fscal,dz33);
1535 /* Update vectorial force */
1536 fix3 = _mm256_add_pd(fix3,tx);
1537 fiy3 = _mm256_add_pd(fiy3,ty);
1538 fiz3 = _mm256_add_pd(fiz3,tz);
1540 fjx3 = _mm256_add_pd(fjx3,tx);
1541 fjy3 = _mm256_add_pd(fjy3,ty);
1542 fjz3 = _mm256_add_pd(fjz3,tz);
/* All four j entries are real in this loop, so every lane targets f. */
1544 fjptrA = f+j_coord_offsetA;
1545 fjptrB = f+j_coord_offsetB;
1546 fjptrC = f+j_coord_offsetC;
1547 fjptrD = f+j_coord_offsetD;
1549 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1550 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1551 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1553 /* Inner loop uses 264 flops */
/* Masked remainder: reached when the loop above stopped with
 * jidx < j_index_end, i.e. because jjnr[jidx+3] was negative. The same
 * interactions are evaluated once more, with fscal cleared through
 * dummy_mask before it is used.
 */
1556 if(jidx<j_index_end)
1559 /* Get j neighbor index, and coordinate index */
1560 jnrlistA = jjnr[jidx];
1561 jnrlistB = jjnr[jidx+1];
1562 jnrlistC = jjnr[jidx+2];
1563 jnrlistD = jjnr[jidx+3];
/* Mask construction: the 32-bit compare flags the negative list entries;
 * the two permutes plus gmx_mm256_set_m128 presumably duplicate each
 * 32-bit result so that it covers one 64-bit double lane - TODO confirm
 * the element order of _GMX_MM_PERMUTE.
 * NOTE(review): the original comment that follows names _mm_andnot_pd,
 * whereas this kernel applies the mask with _mm256_andnot_pd.
 */
1564 /* Sign of each element will be negative for non-real atoms.
1565 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1566 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1568 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1570 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1571 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1572 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1574 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1575 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1576 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1577 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1578 j_coord_offsetA = DIM*jnrA;
1579 j_coord_offsetB = DIM*jnrB;
1580 j_coord_offsetC = DIM*jnrC;
1581 j_coord_offsetD = DIM*jnrD;
1583 /* load j atom coordinates */
/* Negative list entries were clamped to index 0 above, so these loads read
 * the coordinates of atom 0 for such lanes; their force contribution is
 * cleared through dummy_mask further down.
 */
1584 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1585 x+j_coord_offsetC,x+j_coord_offsetD,
1586 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1587 &jy2,&jz2,&jx3,&jy3,&jz3);
1589 /* Calculate displacement vector */
1590 dx00 = _mm256_sub_pd(ix0,jx0);
1591 dy00 = _mm256_sub_pd(iy0,jy0);
1592 dz00 = _mm256_sub_pd(iz0,jz0);
1593 dx11 = _mm256_sub_pd(ix1,jx1);
1594 dy11 = _mm256_sub_pd(iy1,jy1);
1595 dz11 = _mm256_sub_pd(iz1,jz1);
1596 dx12 = _mm256_sub_pd(ix1,jx2);
1597 dy12 = _mm256_sub_pd(iy1,jy2);
1598 dz12 = _mm256_sub_pd(iz1,jz2);
1599 dx13 = _mm256_sub_pd(ix1,jx3);
1600 dy13 = _mm256_sub_pd(iy1,jy3);
1601 dz13 = _mm256_sub_pd(iz1,jz3);
1602 dx21 = _mm256_sub_pd(ix2,jx1);
1603 dy21 = _mm256_sub_pd(iy2,jy1);
1604 dz21 = _mm256_sub_pd(iz2,jz1);
1605 dx22 = _mm256_sub_pd(ix2,jx2);
1606 dy22 = _mm256_sub_pd(iy2,jy2);
1607 dz22 = _mm256_sub_pd(iz2,jz2);
1608 dx23 = _mm256_sub_pd(ix2,jx3);
1609 dy23 = _mm256_sub_pd(iy2,jy3);
1610 dz23 = _mm256_sub_pd(iz2,jz3);
1611 dx31 = _mm256_sub_pd(ix3,jx1);
1612 dy31 = _mm256_sub_pd(iy3,jy1);
1613 dz31 = _mm256_sub_pd(iz3,jz1);
1614 dx32 = _mm256_sub_pd(ix3,jx2);
1615 dy32 = _mm256_sub_pd(iy3,jy2);
1616 dz32 = _mm256_sub_pd(iz3,jz2);
1617 dx33 = _mm256_sub_pd(ix3,jx3);
1618 dy33 = _mm256_sub_pd(iy3,jy3);
1619 dz33 = _mm256_sub_pd(iz3,jz3);
1621 /* Calculate squared distance and things based on it */
1622 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1623 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1624 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1625 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1626 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1627 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1628 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1629 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1630 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1631 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1633 rinv11 = avx256_invsqrt_d(rsq11);
1634 rinv12 = avx256_invsqrt_d(rsq12);
1635 rinv13 = avx256_invsqrt_d(rsq13);
1636 rinv21 = avx256_invsqrt_d(rsq21);
1637 rinv22 = avx256_invsqrt_d(rsq22);
1638 rinv23 = avx256_invsqrt_d(rsq23);
1639 rinv31 = avx256_invsqrt_d(rsq31);
1640 rinv32 = avx256_invsqrt_d(rsq32);
1641 rinv33 = avx256_invsqrt_d(rsq33);
1643 rinvsq00 = avx256_inv_d(rsq00);
1644 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1645 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1646 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1647 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1648 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1649 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1650 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1651 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1652 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1654 fjx0 = _mm256_setzero_pd();
1655 fjy0 = _mm256_setzero_pd();
1656 fjz0 = _mm256_setzero_pd();
1657 fjx1 = _mm256_setzero_pd();
1658 fjy1 = _mm256_setzero_pd();
1659 fjz1 = _mm256_setzero_pd();
1660 fjx2 = _mm256_setzero_pd();
1661 fjy2 = _mm256_setzero_pd();
1662 fjz2 = _mm256_setzero_pd();
1663 fjx3 = _mm256_setzero_pd();
1664 fjy3 = _mm256_setzero_pd();
1665 fjz3 = _mm256_setzero_pd();
1667 /**************************
1668 * CALCULATE INTERACTIONS *
1669 **************************/
1671 /* LENNARD-JONES DISPERSION/REPULSION */
1673 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1674 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
/* Clear the scalar force in lanes flagged by dummy_mask so that they add
 * nothing to either the i or the j accumulators. The same masking step
 * precedes every force update in this section.
 */
1678 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1680 /* Calculate temporary vectorial force */
1681 tx = _mm256_mul_pd(fscal,dx00);
1682 ty = _mm256_mul_pd(fscal,dy00);
1683 tz = _mm256_mul_pd(fscal,dz00);
1685 /* Update vectorial force */
1686 fix0 = _mm256_add_pd(fix0,tx);
1687 fiy0 = _mm256_add_pd(fiy0,ty);
1688 fiz0 = _mm256_add_pd(fiz0,tz);
1690 fjx0 = _mm256_add_pd(fjx0,tx);
1691 fjy0 = _mm256_add_pd(fjy0,ty);
1692 fjz0 = _mm256_add_pd(fjz0,tz);
1694 /**************************
1695 * CALCULATE INTERACTIONS *
1696 **************************/
1698 /* COULOMB ELECTROSTATICS */
1699 velec = _mm256_mul_pd(qq11,rinv11);
1700 felec = _mm256_mul_pd(velec,rinvsq11);
1704 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1706 /* Calculate temporary vectorial force */
1707 tx = _mm256_mul_pd(fscal,dx11);
1708 ty = _mm256_mul_pd(fscal,dy11);
1709 tz = _mm256_mul_pd(fscal,dz11);
1711 /* Update vectorial force */
1712 fix1 = _mm256_add_pd(fix1,tx);
1713 fiy1 = _mm256_add_pd(fiy1,ty);
1714 fiz1 = _mm256_add_pd(fiz1,tz);
1716 fjx1 = _mm256_add_pd(fjx1,tx);
1717 fjy1 = _mm256_add_pd(fjy1,ty);
1718 fjz1 = _mm256_add_pd(fjz1,tz);
1720 /**************************
1721 * CALCULATE INTERACTIONS *
1722 **************************/
1724 /* COULOMB ELECTROSTATICS */
1725 velec = _mm256_mul_pd(qq12,rinv12);
1726 felec = _mm256_mul_pd(velec,rinvsq12);
1730 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1732 /* Calculate temporary vectorial force */
1733 tx = _mm256_mul_pd(fscal,dx12);
1734 ty = _mm256_mul_pd(fscal,dy12);
1735 tz = _mm256_mul_pd(fscal,dz12);
1737 /* Update vectorial force */
1738 fix1 = _mm256_add_pd(fix1,tx);
1739 fiy1 = _mm256_add_pd(fiy1,ty);
1740 fiz1 = _mm256_add_pd(fiz1,tz);
1742 fjx2 = _mm256_add_pd(fjx2,tx);
1743 fjy2 = _mm256_add_pd(fjy2,ty);
1744 fjz2 = _mm256_add_pd(fjz2,tz);
1746 /**************************
1747 * CALCULATE INTERACTIONS *
1748 **************************/
1750 /* COULOMB ELECTROSTATICS */
1751 velec = _mm256_mul_pd(qq13,rinv13);
1752 felec = _mm256_mul_pd(velec,rinvsq13);
1756 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1758 /* Calculate temporary vectorial force */
1759 tx = _mm256_mul_pd(fscal,dx13);
1760 ty = _mm256_mul_pd(fscal,dy13);
1761 tz = _mm256_mul_pd(fscal,dz13);
1763 /* Update vectorial force */
1764 fix1 = _mm256_add_pd(fix1,tx);
1765 fiy1 = _mm256_add_pd(fiy1,ty);
1766 fiz1 = _mm256_add_pd(fiz1,tz);
1768 fjx3 = _mm256_add_pd(fjx3,tx);
1769 fjy3 = _mm256_add_pd(fjy3,ty);
1770 fjz3 = _mm256_add_pd(fjz3,tz);
1772 /**************************
1773 * CALCULATE INTERACTIONS *
1774 **************************/
1776 /* COULOMB ELECTROSTATICS */
1777 velec = _mm256_mul_pd(qq21,rinv21);
1778 felec = _mm256_mul_pd(velec,rinvsq21);
1782 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1784 /* Calculate temporary vectorial force */
1785 tx = _mm256_mul_pd(fscal,dx21);
1786 ty = _mm256_mul_pd(fscal,dy21);
1787 tz = _mm256_mul_pd(fscal,dz21);
1789 /* Update vectorial force */
1790 fix2 = _mm256_add_pd(fix2,tx);
1791 fiy2 = _mm256_add_pd(fiy2,ty);
1792 fiz2 = _mm256_add_pd(fiz2,tz);
1794 fjx1 = _mm256_add_pd(fjx1,tx);
1795 fjy1 = _mm256_add_pd(fjy1,ty);
1796 fjz1 = _mm256_add_pd(fjz1,tz);
1798 /**************************
1799 * CALCULATE INTERACTIONS *
1800 **************************/
1802 /* COULOMB ELECTROSTATICS */
1803 velec = _mm256_mul_pd(qq22,rinv22);
1804 felec = _mm256_mul_pd(velec,rinvsq22);
1808 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1810 /* Calculate temporary vectorial force */
1811 tx = _mm256_mul_pd(fscal,dx22);
1812 ty = _mm256_mul_pd(fscal,dy22);
1813 tz = _mm256_mul_pd(fscal,dz22);
1815 /* Update vectorial force */
1816 fix2 = _mm256_add_pd(fix2,tx);
1817 fiy2 = _mm256_add_pd(fiy2,ty);
1818 fiz2 = _mm256_add_pd(fiz2,tz);
1820 fjx2 = _mm256_add_pd(fjx2,tx);
1821 fjy2 = _mm256_add_pd(fjy2,ty);
1822 fjz2 = _mm256_add_pd(fjz2,tz);
1824 /**************************
1825 * CALCULATE INTERACTIONS *
1826 **************************/
1828 /* COULOMB ELECTROSTATICS */
1829 velec = _mm256_mul_pd(qq23,rinv23);
1830 felec = _mm256_mul_pd(velec,rinvsq23);
1834 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1836 /* Calculate temporary vectorial force */
1837 tx = _mm256_mul_pd(fscal,dx23);
1838 ty = _mm256_mul_pd(fscal,dy23);
1839 tz = _mm256_mul_pd(fscal,dz23);
1841 /* Update vectorial force */
1842 fix2 = _mm256_add_pd(fix2,tx);
1843 fiy2 = _mm256_add_pd(fiy2,ty);
1844 fiz2 = _mm256_add_pd(fiz2,tz);
1846 fjx3 = _mm256_add_pd(fjx3,tx);
1847 fjy3 = _mm256_add_pd(fjy3,ty);
1848 fjz3 = _mm256_add_pd(fjz3,tz);
1850 /**************************
1851 * CALCULATE INTERACTIONS *
1852 **************************/
1854 /* COULOMB ELECTROSTATICS */
1855 velec = _mm256_mul_pd(qq31,rinv31);
1856 felec = _mm256_mul_pd(velec,rinvsq31);
1860 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1862 /* Calculate temporary vectorial force */
1863 tx = _mm256_mul_pd(fscal,dx31);
1864 ty = _mm256_mul_pd(fscal,dy31);
1865 tz = _mm256_mul_pd(fscal,dz31);
1867 /* Update vectorial force */
1868 fix3 = _mm256_add_pd(fix3,tx);
1869 fiy3 = _mm256_add_pd(fiy3,ty);
1870 fiz3 = _mm256_add_pd(fiz3,tz);
1872 fjx1 = _mm256_add_pd(fjx1,tx);
1873 fjy1 = _mm256_add_pd(fjy1,ty);
1874 fjz1 = _mm256_add_pd(fjz1,tz);
1876 /**************************
1877 * CALCULATE INTERACTIONS *
1878 **************************/
1880 /* COULOMB ELECTROSTATICS */
1881 velec = _mm256_mul_pd(qq32,rinv32);
1882 felec = _mm256_mul_pd(velec,rinvsq32);
1886 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1888 /* Calculate temporary vectorial force */
1889 tx = _mm256_mul_pd(fscal,dx32);
1890 ty = _mm256_mul_pd(fscal,dy32);
1891 tz = _mm256_mul_pd(fscal,dz32);
1893 /* Update vectorial force */
1894 fix3 = _mm256_add_pd(fix3,tx);
1895 fiy3 = _mm256_add_pd(fiy3,ty);
1896 fiz3 = _mm256_add_pd(fiz3,tz);
1898 fjx2 = _mm256_add_pd(fjx2,tx);
1899 fjy2 = _mm256_add_pd(fjy2,ty);
1900 fjz2 = _mm256_add_pd(fjz2,tz);
1902 /**************************
1903 * CALCULATE INTERACTIONS *
1904 **************************/
1906 /* COULOMB ELECTROSTATICS */
1907 velec = _mm256_mul_pd(qq33,rinv33);
1908 felec = _mm256_mul_pd(velec,rinvsq33);
1912 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1914 /* Calculate temporary vectorial force */
1915 tx = _mm256_mul_pd(fscal,dx33);
1916 ty = _mm256_mul_pd(fscal,dy33);
1917 tz = _mm256_mul_pd(fscal,dz33);
1919 /* Update vectorial force */
1920 fix3 = _mm256_add_pd(fix3,tx);
1921 fiy3 = _mm256_add_pd(fiy3,ty);
1922 fiz3 = _mm256_add_pd(fiz3,tz);
1924 fjx3 = _mm256_add_pd(fjx3,tx);
1925 fjy3 = _mm256_add_pd(fjy3,ty);
1926 fjz3 = _mm256_add_pd(fjz3,tz);
/* Lanes with a negative list entry are pointed at scratch instead of f, so
 * the update for those lanes does not touch the force array.
 */
1928 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1929 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1930 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1931 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1933 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1934 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1935 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1937 /* Inner loop uses 264 flops */
1940 /* End of innermost loop */
/* Hand the accumulated i-site forces to the update helper together with
 * the force and shift-force destinations for this i water.
 */
1942 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1943 f+i_coord_offset,fshift+i_shift_offset);
1945 /* Increment number of inner iterations */
1946 inneriter += j_index_end - j_index_start;
1948 /* Outer loop uses 24 flops */
1951 /* Increment number of outer iterations */
/* Flop accounting: 24 per outer iteration plus 264 per counted j entry,
 * reported under eNR_NBKERNEL_ELEC_VDW_W4W4_F.
 */
1954 /* Update outer/inner flops */
1956 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*264);