2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_double
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Water4
41 * Calculate force/pot: PotentialAndForce
/*
 * nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_double
 *
 * Nonbonded kernel for pairs of four-site waters: plain Coulomb on the nine
 * pairs formed by sites 1..3 of each water, Lennard-Jones on the 0-0 pair
 * only. Accumulates both forces and potential energies, with four j waters
 * occupying the four lanes of each __m256d.
 *
 * nlist       - neighbour list; jindex, shift and iinr[0] are read here.
 * xx, ff      - coordinate / force arrays (NOTE(review): presumably aliased
 *               by the local x and f pointers; that assignment is not
 *               visible in this view - confirm).
 * fr          - supplies shift_vec, fshift, epsfac and ntype.
 * mdatoms     - supplies chargeA and typeA.
 * kernel_data - energygrp_elec / energygrp_vdw receive the summed potentials.
 * nrnb        - counter passed to inc_nrnb once, at the end.
 *
 * NOTE(review): in this view fscal is consumed after every interaction but
 * never assigned (felec and fvdw are computed and then left unused), scratch,
 * charge, vdwparam, nvdwtype and vdwtype are used without a visible
 * declaration in this function, and nri, jjnr, x, f, ggid, inneriter and
 * outeriter are used without a visible assignment. Confirm against the
 * complete file before relying on this listing.
 */
44 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
81 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
82 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
83 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
84 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
85 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
86 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
87 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
88 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
89 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
90 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
91 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
92 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
93 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
94 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
95 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
96 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
99 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
102 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
103 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
104 __m256d dummy_mask,cutoff_mask;
105 __m128 tmpmask0,tmpmask1;
106 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
107 __m256d one = _mm256_set1_pd(1.0);
108 __m256d two = _mm256_set1_pd(2.0);
114 jindex = nlist->jindex;
116 shiftidx = nlist->shift;
118 shiftvec = fr->shift_vec[0];
119 fshift = fr->fshift[0];
120 facel = _mm256_set1_pd(fr->epsfac);
121 charge = mdatoms->chargeA;
122 nvdwtype = fr->ntype;
124 vdwtype = mdatoms->typeA;
126 /* Setup water-specific parameters */
127 inr = nlist->iinr[0];
128 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
129 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
130 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
131 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
133 jq1 = _mm256_set1_pd(charge[inr+1]);
134 jq2 = _mm256_set1_pd(charge[inr+2]);
135 jq3 = _mm256_set1_pd(charge[inr+3]);
/* Lennard-Jones c6/c12 are looked up for the site 0 - site 0 pair only,
 * using the type of the same atom (inr+0) on both the i and the j side. */
136 vdwjidx0A = 2*vdwtype[inr+0];
137 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
138 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
/* Charge products for every i-site/j-site combination among sites 1..3.
 * The iq values already carry the facel factor, so each qq does as well. */
139 qq11 = _mm256_mul_pd(iq1,jq1);
140 qq12 = _mm256_mul_pd(iq1,jq2);
141 qq13 = _mm256_mul_pd(iq1,jq3);
142 qq21 = _mm256_mul_pd(iq2,jq1);
143 qq22 = _mm256_mul_pd(iq2,jq2);
144 qq23 = _mm256_mul_pd(iq2,jq3);
145 qq31 = _mm256_mul_pd(iq3,jq1);
146 qq32 = _mm256_mul_pd(iq3,jq2);
147 qq33 = _mm256_mul_pd(iq3,jq3);
149 /* Avoid stupid compiler warnings */
150 jnrA = jnrB = jnrC = jnrD = 0;
159 for(iidx=0;iidx<4*DIM;iidx++)
164 /* Start outer loop over neighborlists */
165 for(iidx=0; iidx<nri; iidx++)
167 /* Load shift vector for this list */
168 i_shift_offset = DIM*shiftidx[iidx];
170 /* Load limits for loop over neighbors */
171 j_index_start = jindex[iidx];
172 j_index_end = jindex[iidx+1];
174 /* Get outer coordinate index */
176 i_coord_offset = DIM*inr;
178 /* Load i particle coords and add shift vector */
179 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
180 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
182 fix0 = _mm256_setzero_pd();
183 fiy0 = _mm256_setzero_pd();
184 fiz0 = _mm256_setzero_pd();
185 fix1 = _mm256_setzero_pd();
186 fiy1 = _mm256_setzero_pd();
187 fiz1 = _mm256_setzero_pd();
188 fix2 = _mm256_setzero_pd();
189 fiy2 = _mm256_setzero_pd();
190 fiz2 = _mm256_setzero_pd();
191 fix3 = _mm256_setzero_pd();
192 fiy3 = _mm256_setzero_pd();
193 fiz3 = _mm256_setzero_pd();
195 /* Reset potential sums */
196 velecsum = _mm256_setzero_pd();
197 vvdwsum = _mm256_setzero_pd();
199 /* Start inner kernel loop */
/* Advances four list entries per pass and only runs while the fourth entry
 * of the group, jjnr[jidx+3], is non-negative. */
200 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
203 /* Get j neighbor index, and coordinate index */
208 j_coord_offsetA = DIM*jnrA;
209 j_coord_offsetB = DIM*jnrB;
210 j_coord_offsetC = DIM*jnrC;
211 j_coord_offsetD = DIM*jnrD;
213 /* load j atom coordinates */
214 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
215 x+j_coord_offsetC,x+j_coord_offsetD,
216 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
217 &jy2,&jz2,&jx3,&jy3,&jz3);
219 /* Calculate displacement vector */
220 dx00 = _mm256_sub_pd(ix0,jx0);
221 dy00 = _mm256_sub_pd(iy0,jy0);
222 dz00 = _mm256_sub_pd(iz0,jz0);
223 dx11 = _mm256_sub_pd(ix1,jx1);
224 dy11 = _mm256_sub_pd(iy1,jy1);
225 dz11 = _mm256_sub_pd(iz1,jz1);
226 dx12 = _mm256_sub_pd(ix1,jx2);
227 dy12 = _mm256_sub_pd(iy1,jy2);
228 dz12 = _mm256_sub_pd(iz1,jz2);
229 dx13 = _mm256_sub_pd(ix1,jx3);
230 dy13 = _mm256_sub_pd(iy1,jy3);
231 dz13 = _mm256_sub_pd(iz1,jz3);
232 dx21 = _mm256_sub_pd(ix2,jx1);
233 dy21 = _mm256_sub_pd(iy2,jy1);
234 dz21 = _mm256_sub_pd(iz2,jz1);
235 dx22 = _mm256_sub_pd(ix2,jx2);
236 dy22 = _mm256_sub_pd(iy2,jy2);
237 dz22 = _mm256_sub_pd(iz2,jz2);
238 dx23 = _mm256_sub_pd(ix2,jx3);
239 dy23 = _mm256_sub_pd(iy2,jy3);
240 dz23 = _mm256_sub_pd(iz2,jz3);
241 dx31 = _mm256_sub_pd(ix3,jx1);
242 dy31 = _mm256_sub_pd(iy3,jy1);
243 dz31 = _mm256_sub_pd(iz3,jz1);
244 dx32 = _mm256_sub_pd(ix3,jx2);
245 dy32 = _mm256_sub_pd(iy3,jy2);
246 dz32 = _mm256_sub_pd(iz3,jz2);
247 dx33 = _mm256_sub_pd(ix3,jx3);
248 dy33 = _mm256_sub_pd(iy3,jy3);
249 dz33 = _mm256_sub_pd(iz3,jz3);
251 /* Calculate squared distance and things based on it */
252 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
253 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
254 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
255 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
256 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
257 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
258 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
259 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
260 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
261 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
263 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
264 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
265 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
266 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
267 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
268 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
269 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
270 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
271 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
/* Pair 00 carries no Coulomb term, so no rinv00 is formed: rinvsq00 comes
 * from gmx_mm256_inv_pd on rsq00, while the charged pairs square their rinv. */
273 rinvsq00 = gmx_mm256_inv_pd(rsq00);
274 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
275 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
276 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
277 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
278 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
279 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
280 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
281 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
282 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
284 fjx0 = _mm256_setzero_pd();
285 fjy0 = _mm256_setzero_pd();
286 fjz0 = _mm256_setzero_pd();
287 fjx1 = _mm256_setzero_pd();
288 fjy1 = _mm256_setzero_pd();
289 fjz1 = _mm256_setzero_pd();
290 fjx2 = _mm256_setzero_pd();
291 fjy2 = _mm256_setzero_pd();
292 fjz2 = _mm256_setzero_pd();
293 fjx3 = _mm256_setzero_pd();
294 fjy3 = _mm256_setzero_pd();
295 fjz3 = _mm256_setzero_pd();
297 /**************************
298 * CALCULATE INTERACTIONS *
299 **************************/
301 /* LENNARD-JONES DISPERSION/REPULSION */
303 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
304 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
305 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
306 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
307 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
309 /* Update potential sum for this i atom from the interaction with this j atom. */
310 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
314 /* Calculate temporary vectorial force */
315 tx = _mm256_mul_pd(fscal,dx00);
316 ty = _mm256_mul_pd(fscal,dy00);
317 tz = _mm256_mul_pd(fscal,dz00);
319 /* Update vectorial force */
320 fix0 = _mm256_add_pd(fix0,tx);
321 fiy0 = _mm256_add_pd(fiy0,ty);
322 fiz0 = _mm256_add_pd(fiz0,tz);
324 fjx0 = _mm256_add_pd(fjx0,tx);
325 fjy0 = _mm256_add_pd(fjy0,ty);
326 fjz0 = _mm256_add_pd(fjz0,tz);
328 /**************************
329 * CALCULATE INTERACTIONS *
330 **************************/
332 /* COULOMB ELECTROSTATICS */
333 velec = _mm256_mul_pd(qq11,rinv11);
334 felec = _mm256_mul_pd(velec,rinvsq11);
336 /* Update potential sum for this i atom from the interaction with this j atom. */
337 velecsum = _mm256_add_pd(velecsum,velec);
341 /* Calculate temporary vectorial force */
342 tx = _mm256_mul_pd(fscal,dx11);
343 ty = _mm256_mul_pd(fscal,dy11);
344 tz = _mm256_mul_pd(fscal,dz11);
346 /* Update vectorial force */
347 fix1 = _mm256_add_pd(fix1,tx);
348 fiy1 = _mm256_add_pd(fiy1,ty);
349 fiz1 = _mm256_add_pd(fiz1,tz);
351 fjx1 = _mm256_add_pd(fjx1,tx);
352 fjy1 = _mm256_add_pd(fjy1,ty);
353 fjz1 = _mm256_add_pd(fjz1,tz);
355 /**************************
356 * CALCULATE INTERACTIONS *
357 **************************/
359 /* COULOMB ELECTROSTATICS */
360 velec = _mm256_mul_pd(qq12,rinv12);
361 felec = _mm256_mul_pd(velec,rinvsq12);
363 /* Update potential sum for this i atom from the interaction with this j atom. */
364 velecsum = _mm256_add_pd(velecsum,velec);
368 /* Calculate temporary vectorial force */
369 tx = _mm256_mul_pd(fscal,dx12);
370 ty = _mm256_mul_pd(fscal,dy12);
371 tz = _mm256_mul_pd(fscal,dz12);
373 /* Update vectorial force */
374 fix1 = _mm256_add_pd(fix1,tx);
375 fiy1 = _mm256_add_pd(fiy1,ty);
376 fiz1 = _mm256_add_pd(fiz1,tz);
378 fjx2 = _mm256_add_pd(fjx2,tx);
379 fjy2 = _mm256_add_pd(fjy2,ty);
380 fjz2 = _mm256_add_pd(fjz2,tz);
382 /**************************
383 * CALCULATE INTERACTIONS *
384 **************************/
386 /* COULOMB ELECTROSTATICS */
387 velec = _mm256_mul_pd(qq13,rinv13);
388 felec = _mm256_mul_pd(velec,rinvsq13);
390 /* Update potential sum for this i atom from the interaction with this j atom. */
391 velecsum = _mm256_add_pd(velecsum,velec);
395 /* Calculate temporary vectorial force */
396 tx = _mm256_mul_pd(fscal,dx13);
397 ty = _mm256_mul_pd(fscal,dy13);
398 tz = _mm256_mul_pd(fscal,dz13);
400 /* Update vectorial force */
401 fix1 = _mm256_add_pd(fix1,tx);
402 fiy1 = _mm256_add_pd(fiy1,ty);
403 fiz1 = _mm256_add_pd(fiz1,tz);
405 fjx3 = _mm256_add_pd(fjx3,tx);
406 fjy3 = _mm256_add_pd(fjy3,ty);
407 fjz3 = _mm256_add_pd(fjz3,tz);
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 /* COULOMB ELECTROSTATICS */
414 velec = _mm256_mul_pd(qq21,rinv21);
415 felec = _mm256_mul_pd(velec,rinvsq21);
417 /* Update potential sum for this i atom from the interaction with this j atom. */
418 velecsum = _mm256_add_pd(velecsum,velec);
422 /* Calculate temporary vectorial force */
423 tx = _mm256_mul_pd(fscal,dx21);
424 ty = _mm256_mul_pd(fscal,dy21);
425 tz = _mm256_mul_pd(fscal,dz21);
427 /* Update vectorial force */
428 fix2 = _mm256_add_pd(fix2,tx);
429 fiy2 = _mm256_add_pd(fiy2,ty);
430 fiz2 = _mm256_add_pd(fiz2,tz);
432 fjx1 = _mm256_add_pd(fjx1,tx);
433 fjy1 = _mm256_add_pd(fjy1,ty);
434 fjz1 = _mm256_add_pd(fjz1,tz);
436 /**************************
437 * CALCULATE INTERACTIONS *
438 **************************/
440 /* COULOMB ELECTROSTATICS */
441 velec = _mm256_mul_pd(qq22,rinv22);
442 felec = _mm256_mul_pd(velec,rinvsq22);
444 /* Update potential sum for this i atom from the interaction with this j atom. */
445 velecsum = _mm256_add_pd(velecsum,velec);
449 /* Calculate temporary vectorial force */
450 tx = _mm256_mul_pd(fscal,dx22);
451 ty = _mm256_mul_pd(fscal,dy22);
452 tz = _mm256_mul_pd(fscal,dz22);
454 /* Update vectorial force */
455 fix2 = _mm256_add_pd(fix2,tx);
456 fiy2 = _mm256_add_pd(fiy2,ty);
457 fiz2 = _mm256_add_pd(fiz2,tz);
459 fjx2 = _mm256_add_pd(fjx2,tx);
460 fjy2 = _mm256_add_pd(fjy2,ty);
461 fjz2 = _mm256_add_pd(fjz2,tz);
463 /**************************
464 * CALCULATE INTERACTIONS *
465 **************************/
467 /* COULOMB ELECTROSTATICS */
468 velec = _mm256_mul_pd(qq23,rinv23);
469 felec = _mm256_mul_pd(velec,rinvsq23);
471 /* Update potential sum for this i atom from the interaction with this j atom. */
472 velecsum = _mm256_add_pd(velecsum,velec);
476 /* Calculate temporary vectorial force */
477 tx = _mm256_mul_pd(fscal,dx23);
478 ty = _mm256_mul_pd(fscal,dy23);
479 tz = _mm256_mul_pd(fscal,dz23);
481 /* Update vectorial force */
482 fix2 = _mm256_add_pd(fix2,tx);
483 fiy2 = _mm256_add_pd(fiy2,ty);
484 fiz2 = _mm256_add_pd(fiz2,tz);
486 fjx3 = _mm256_add_pd(fjx3,tx);
487 fjy3 = _mm256_add_pd(fjy3,ty);
488 fjz3 = _mm256_add_pd(fjz3,tz);
490 /**************************
491 * CALCULATE INTERACTIONS *
492 **************************/
494 /* COULOMB ELECTROSTATICS */
495 velec = _mm256_mul_pd(qq31,rinv31);
496 felec = _mm256_mul_pd(velec,rinvsq31);
498 /* Update potential sum for this i atom from the interaction with this j atom. */
499 velecsum = _mm256_add_pd(velecsum,velec);
503 /* Calculate temporary vectorial force */
504 tx = _mm256_mul_pd(fscal,dx31);
505 ty = _mm256_mul_pd(fscal,dy31);
506 tz = _mm256_mul_pd(fscal,dz31);
508 /* Update vectorial force */
509 fix3 = _mm256_add_pd(fix3,tx);
510 fiy3 = _mm256_add_pd(fiy3,ty);
511 fiz3 = _mm256_add_pd(fiz3,tz);
513 fjx1 = _mm256_add_pd(fjx1,tx);
514 fjy1 = _mm256_add_pd(fjy1,ty);
515 fjz1 = _mm256_add_pd(fjz1,tz);
517 /**************************
518 * CALCULATE INTERACTIONS *
519 **************************/
521 /* COULOMB ELECTROSTATICS */
522 velec = _mm256_mul_pd(qq32,rinv32);
523 felec = _mm256_mul_pd(velec,rinvsq32);
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velecsum = _mm256_add_pd(velecsum,velec);
530 /* Calculate temporary vectorial force */
531 tx = _mm256_mul_pd(fscal,dx32);
532 ty = _mm256_mul_pd(fscal,dy32);
533 tz = _mm256_mul_pd(fscal,dz32);
535 /* Update vectorial force */
536 fix3 = _mm256_add_pd(fix3,tx);
537 fiy3 = _mm256_add_pd(fiy3,ty);
538 fiz3 = _mm256_add_pd(fiz3,tz);
540 fjx2 = _mm256_add_pd(fjx2,tx);
541 fjy2 = _mm256_add_pd(fjy2,ty);
542 fjz2 = _mm256_add_pd(fjz2,tz);
544 /**************************
545 * CALCULATE INTERACTIONS *
546 **************************/
548 /* COULOMB ELECTROSTATICS */
549 velec = _mm256_mul_pd(qq33,rinv33);
550 felec = _mm256_mul_pd(velec,rinvsq33);
552 /* Update potential sum for this i atom from the interaction with this j atom. */
553 velecsum = _mm256_add_pd(velecsum,velec);
557 /* Calculate temporary vectorial force */
558 tx = _mm256_mul_pd(fscal,dx33);
559 ty = _mm256_mul_pd(fscal,dy33);
560 tz = _mm256_mul_pd(fscal,dz33);
562 /* Update vectorial force */
563 fix3 = _mm256_add_pd(fix3,tx);
564 fiy3 = _mm256_add_pd(fiy3,ty);
565 fiz3 = _mm256_add_pd(fiz3,tz);
567 fjx3 = _mm256_add_pd(fjx3,tx);
568 fjy3 = _mm256_add_pd(fjy3,ty);
569 fjz3 = _mm256_add_pd(fjz3,tz);
/* One force pointer per SIMD lane; the accumulated j forces are handed to
 * the decrement helper below. */
571 fjptrA = f+j_coord_offsetA;
572 fjptrB = f+j_coord_offsetB;
573 fjptrC = f+j_coord_offsetC;
574 fjptrD = f+j_coord_offsetD;
576 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
577 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
578 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
580 /* Inner loop uses 278 flops */
/* Second copy of the interaction code, for a group of four list entries in
 * which some may be negative. NOTE(review): the condition that selects this
 * pass is not visible in this view. */
586 /* Get j neighbor index, and coordinate index */
587 jnrlistA = jjnr[jidx];
588 jnrlistB = jjnr[jidx+1];
589 jnrlistC = jjnr[jidx+2];
590 jnrlistD = jjnr[jidx+3];
591 /* Sign of each element will be negative for non-real atoms.
592 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
593 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
595 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
597 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
598 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
599 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
601 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
602 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
603 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
604 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
605 j_coord_offsetA = DIM*jnrA;
606 j_coord_offsetB = DIM*jnrB;
607 j_coord_offsetC = DIM*jnrC;
608 j_coord_offsetD = DIM*jnrD;
610 /* load j atom coordinates */
611 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
612 x+j_coord_offsetC,x+j_coord_offsetD,
613 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
614 &jy2,&jz2,&jx3,&jy3,&jz3);
616 /* Calculate displacement vector */
617 dx00 = _mm256_sub_pd(ix0,jx0);
618 dy00 = _mm256_sub_pd(iy0,jy0);
619 dz00 = _mm256_sub_pd(iz0,jz0);
620 dx11 = _mm256_sub_pd(ix1,jx1);
621 dy11 = _mm256_sub_pd(iy1,jy1);
622 dz11 = _mm256_sub_pd(iz1,jz1);
623 dx12 = _mm256_sub_pd(ix1,jx2);
624 dy12 = _mm256_sub_pd(iy1,jy2);
625 dz12 = _mm256_sub_pd(iz1,jz2);
626 dx13 = _mm256_sub_pd(ix1,jx3);
627 dy13 = _mm256_sub_pd(iy1,jy3);
628 dz13 = _mm256_sub_pd(iz1,jz3);
629 dx21 = _mm256_sub_pd(ix2,jx1);
630 dy21 = _mm256_sub_pd(iy2,jy1);
631 dz21 = _mm256_sub_pd(iz2,jz1);
632 dx22 = _mm256_sub_pd(ix2,jx2);
633 dy22 = _mm256_sub_pd(iy2,jy2);
634 dz22 = _mm256_sub_pd(iz2,jz2);
635 dx23 = _mm256_sub_pd(ix2,jx3);
636 dy23 = _mm256_sub_pd(iy2,jy3);
637 dz23 = _mm256_sub_pd(iz2,jz3);
638 dx31 = _mm256_sub_pd(ix3,jx1);
639 dy31 = _mm256_sub_pd(iy3,jy1);
640 dz31 = _mm256_sub_pd(iz3,jz1);
641 dx32 = _mm256_sub_pd(ix3,jx2);
642 dy32 = _mm256_sub_pd(iy3,jy2);
643 dz32 = _mm256_sub_pd(iz3,jz2);
644 dx33 = _mm256_sub_pd(ix3,jx3);
645 dy33 = _mm256_sub_pd(iy3,jy3);
646 dz33 = _mm256_sub_pd(iz3,jz3);
648 /* Calculate squared distance and things based on it */
649 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
650 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
651 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
652 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
653 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
654 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
655 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
656 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
657 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
658 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
660 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
661 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
662 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
663 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
664 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
665 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
666 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
667 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
668 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
670 rinvsq00 = gmx_mm256_inv_pd(rsq00);
671 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
672 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
673 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
674 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
675 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
676 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
677 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
678 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
679 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
681 fjx0 = _mm256_setzero_pd();
682 fjy0 = _mm256_setzero_pd();
683 fjz0 = _mm256_setzero_pd();
684 fjx1 = _mm256_setzero_pd();
685 fjy1 = _mm256_setzero_pd();
686 fjz1 = _mm256_setzero_pd();
687 fjx2 = _mm256_setzero_pd();
688 fjy2 = _mm256_setzero_pd();
689 fjz2 = _mm256_setzero_pd();
690 fjx3 = _mm256_setzero_pd();
691 fjy3 = _mm256_setzero_pd();
692 fjz3 = _mm256_setzero_pd();
694 /**************************
695 * CALCULATE INTERACTIONS *
696 **************************/
698 /* LENNARD-JONES DISPERSION/REPULSION */
700 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
701 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
702 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
703 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
704 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
706 /* Update potential sum for this i atom from the interaction with this j atom. */
/* From here on every potential and every fscal is and-not'ed with
 * dummy_mask before use, so dummy lanes contribute nothing to the sums. */
707 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
708 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
712 fscal = _mm256_andnot_pd(dummy_mask,fscal);
714 /* Calculate temporary vectorial force */
715 tx = _mm256_mul_pd(fscal,dx00);
716 ty = _mm256_mul_pd(fscal,dy00);
717 tz = _mm256_mul_pd(fscal,dz00);
719 /* Update vectorial force */
720 fix0 = _mm256_add_pd(fix0,tx);
721 fiy0 = _mm256_add_pd(fiy0,ty);
722 fiz0 = _mm256_add_pd(fiz0,tz);
724 fjx0 = _mm256_add_pd(fjx0,tx);
725 fjy0 = _mm256_add_pd(fjy0,ty);
726 fjz0 = _mm256_add_pd(fjz0,tz);
728 /**************************
729 * CALCULATE INTERACTIONS *
730 **************************/
732 /* COULOMB ELECTROSTATICS */
733 velec = _mm256_mul_pd(qq11,rinv11);
734 felec = _mm256_mul_pd(velec,rinvsq11);
736 /* Update potential sum for this i atom from the interaction with this j atom. */
737 velec = _mm256_andnot_pd(dummy_mask,velec);
738 velecsum = _mm256_add_pd(velecsum,velec);
742 fscal = _mm256_andnot_pd(dummy_mask,fscal);
744 /* Calculate temporary vectorial force */
745 tx = _mm256_mul_pd(fscal,dx11);
746 ty = _mm256_mul_pd(fscal,dy11);
747 tz = _mm256_mul_pd(fscal,dz11);
749 /* Update vectorial force */
750 fix1 = _mm256_add_pd(fix1,tx);
751 fiy1 = _mm256_add_pd(fiy1,ty);
752 fiz1 = _mm256_add_pd(fiz1,tz);
754 fjx1 = _mm256_add_pd(fjx1,tx);
755 fjy1 = _mm256_add_pd(fjy1,ty);
756 fjz1 = _mm256_add_pd(fjz1,tz);
758 /**************************
759 * CALCULATE INTERACTIONS *
760 **************************/
762 /* COULOMB ELECTROSTATICS */
763 velec = _mm256_mul_pd(qq12,rinv12);
764 felec = _mm256_mul_pd(velec,rinvsq12);
766 /* Update potential sum for this i atom from the interaction with this j atom. */
767 velec = _mm256_andnot_pd(dummy_mask,velec);
768 velecsum = _mm256_add_pd(velecsum,velec);
772 fscal = _mm256_andnot_pd(dummy_mask,fscal);
774 /* Calculate temporary vectorial force */
775 tx = _mm256_mul_pd(fscal,dx12);
776 ty = _mm256_mul_pd(fscal,dy12);
777 tz = _mm256_mul_pd(fscal,dz12);
779 /* Update vectorial force */
780 fix1 = _mm256_add_pd(fix1,tx);
781 fiy1 = _mm256_add_pd(fiy1,ty);
782 fiz1 = _mm256_add_pd(fiz1,tz);
784 fjx2 = _mm256_add_pd(fjx2,tx);
785 fjy2 = _mm256_add_pd(fjy2,ty);
786 fjz2 = _mm256_add_pd(fjz2,tz);
788 /**************************
789 * CALCULATE INTERACTIONS *
790 **************************/
792 /* COULOMB ELECTROSTATICS */
793 velec = _mm256_mul_pd(qq13,rinv13);
794 felec = _mm256_mul_pd(velec,rinvsq13);
796 /* Update potential sum for this i atom from the interaction with this j atom. */
797 velec = _mm256_andnot_pd(dummy_mask,velec);
798 velecsum = _mm256_add_pd(velecsum,velec);
802 fscal = _mm256_andnot_pd(dummy_mask,fscal);
804 /* Calculate temporary vectorial force */
805 tx = _mm256_mul_pd(fscal,dx13);
806 ty = _mm256_mul_pd(fscal,dy13);
807 tz = _mm256_mul_pd(fscal,dz13);
809 /* Update vectorial force */
810 fix1 = _mm256_add_pd(fix1,tx);
811 fiy1 = _mm256_add_pd(fiy1,ty);
812 fiz1 = _mm256_add_pd(fiz1,tz);
814 fjx3 = _mm256_add_pd(fjx3,tx);
815 fjy3 = _mm256_add_pd(fjy3,ty);
816 fjz3 = _mm256_add_pd(fjz3,tz);
818 /**************************
819 * CALCULATE INTERACTIONS *
820 **************************/
822 /* COULOMB ELECTROSTATICS */
823 velec = _mm256_mul_pd(qq21,rinv21);
824 felec = _mm256_mul_pd(velec,rinvsq21);
826 /* Update potential sum for this i atom from the interaction with this j atom. */
827 velec = _mm256_andnot_pd(dummy_mask,velec);
828 velecsum = _mm256_add_pd(velecsum,velec);
832 fscal = _mm256_andnot_pd(dummy_mask,fscal);
834 /* Calculate temporary vectorial force */
835 tx = _mm256_mul_pd(fscal,dx21);
836 ty = _mm256_mul_pd(fscal,dy21);
837 tz = _mm256_mul_pd(fscal,dz21);
839 /* Update vectorial force */
840 fix2 = _mm256_add_pd(fix2,tx);
841 fiy2 = _mm256_add_pd(fiy2,ty);
842 fiz2 = _mm256_add_pd(fiz2,tz);
844 fjx1 = _mm256_add_pd(fjx1,tx);
845 fjy1 = _mm256_add_pd(fjy1,ty);
846 fjz1 = _mm256_add_pd(fjz1,tz);
848 /**************************
849 * CALCULATE INTERACTIONS *
850 **************************/
852 /* COULOMB ELECTROSTATICS */
853 velec = _mm256_mul_pd(qq22,rinv22);
854 felec = _mm256_mul_pd(velec,rinvsq22);
856 /* Update potential sum for this i atom from the interaction with this j atom. */
857 velec = _mm256_andnot_pd(dummy_mask,velec);
858 velecsum = _mm256_add_pd(velecsum,velec);
862 fscal = _mm256_andnot_pd(dummy_mask,fscal);
864 /* Calculate temporary vectorial force */
865 tx = _mm256_mul_pd(fscal,dx22);
866 ty = _mm256_mul_pd(fscal,dy22);
867 tz = _mm256_mul_pd(fscal,dz22);
869 /* Update vectorial force */
870 fix2 = _mm256_add_pd(fix2,tx);
871 fiy2 = _mm256_add_pd(fiy2,ty);
872 fiz2 = _mm256_add_pd(fiz2,tz);
874 fjx2 = _mm256_add_pd(fjx2,tx);
875 fjy2 = _mm256_add_pd(fjy2,ty);
876 fjz2 = _mm256_add_pd(fjz2,tz);
878 /**************************
879 * CALCULATE INTERACTIONS *
880 **************************/
882 /* COULOMB ELECTROSTATICS */
883 velec = _mm256_mul_pd(qq23,rinv23);
884 felec = _mm256_mul_pd(velec,rinvsq23);
886 /* Update potential sum for this i atom from the interaction with this j atom. */
887 velec = _mm256_andnot_pd(dummy_mask,velec);
888 velecsum = _mm256_add_pd(velecsum,velec);
892 fscal = _mm256_andnot_pd(dummy_mask,fscal);
894 /* Calculate temporary vectorial force */
895 tx = _mm256_mul_pd(fscal,dx23);
896 ty = _mm256_mul_pd(fscal,dy23);
897 tz = _mm256_mul_pd(fscal,dz23);
899 /* Update vectorial force */
900 fix2 = _mm256_add_pd(fix2,tx);
901 fiy2 = _mm256_add_pd(fiy2,ty);
902 fiz2 = _mm256_add_pd(fiz2,tz);
904 fjx3 = _mm256_add_pd(fjx3,tx);
905 fjy3 = _mm256_add_pd(fjy3,ty);
906 fjz3 = _mm256_add_pd(fjz3,tz);
908 /**************************
909 * CALCULATE INTERACTIONS *
910 **************************/
912 /* COULOMB ELECTROSTATICS */
913 velec = _mm256_mul_pd(qq31,rinv31);
914 felec = _mm256_mul_pd(velec,rinvsq31);
916 /* Update potential sum for this i atom from the interaction with this j atom. */
917 velec = _mm256_andnot_pd(dummy_mask,velec);
918 velecsum = _mm256_add_pd(velecsum,velec);
922 fscal = _mm256_andnot_pd(dummy_mask,fscal);
924 /* Calculate temporary vectorial force */
925 tx = _mm256_mul_pd(fscal,dx31);
926 ty = _mm256_mul_pd(fscal,dy31);
927 tz = _mm256_mul_pd(fscal,dz31);
929 /* Update vectorial force */
930 fix3 = _mm256_add_pd(fix3,tx);
931 fiy3 = _mm256_add_pd(fiy3,ty);
932 fiz3 = _mm256_add_pd(fiz3,tz);
934 fjx1 = _mm256_add_pd(fjx1,tx);
935 fjy1 = _mm256_add_pd(fjy1,ty);
936 fjz1 = _mm256_add_pd(fjz1,tz);
938 /**************************
939 * CALCULATE INTERACTIONS *
940 **************************/
942 /* COULOMB ELECTROSTATICS */
943 velec = _mm256_mul_pd(qq32,rinv32);
944 felec = _mm256_mul_pd(velec,rinvsq32);
946 /* Update potential sum for this i atom from the interaction with this j atom. */
947 velec = _mm256_andnot_pd(dummy_mask,velec);
948 velecsum = _mm256_add_pd(velecsum,velec);
952 fscal = _mm256_andnot_pd(dummy_mask,fscal);
954 /* Calculate temporary vectorial force */
955 tx = _mm256_mul_pd(fscal,dx32);
956 ty = _mm256_mul_pd(fscal,dy32);
957 tz = _mm256_mul_pd(fscal,dz32);
959 /* Update vectorial force */
960 fix3 = _mm256_add_pd(fix3,tx);
961 fiy3 = _mm256_add_pd(fiy3,ty);
962 fiz3 = _mm256_add_pd(fiz3,tz);
964 fjx2 = _mm256_add_pd(fjx2,tx);
965 fjy2 = _mm256_add_pd(fjy2,ty);
966 fjz2 = _mm256_add_pd(fjz2,tz);
968 /**************************
969 * CALCULATE INTERACTIONS *
970 **************************/
972 /* COULOMB ELECTROSTATICS */
973 velec = _mm256_mul_pd(qq33,rinv33);
974 felec = _mm256_mul_pd(velec,rinvsq33);
976 /* Update potential sum for this i atom from the interaction with this j atom. */
977 velec = _mm256_andnot_pd(dummy_mask,velec);
978 velecsum = _mm256_add_pd(velecsum,velec);
982 fscal = _mm256_andnot_pd(dummy_mask,fscal);
984 /* Calculate temporary vectorial force */
985 tx = _mm256_mul_pd(fscal,dx33);
986 ty = _mm256_mul_pd(fscal,dy33);
987 tz = _mm256_mul_pd(fscal,dz33);
989 /* Update vectorial force */
990 fix3 = _mm256_add_pd(fix3,tx);
991 fiy3 = _mm256_add_pd(fiy3,ty);
992 fiz3 = _mm256_add_pd(fiz3,tz);
994 fjx3 = _mm256_add_pd(fjx3,tx);
995 fjy3 = _mm256_add_pd(fjy3,ty);
996 fjz3 = _mm256_add_pd(fjz3,tz);
/* Lanes whose list entry is negative are pointed at scratch instead of f,
 * so the decrement below does not touch f for those entries. */
998 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
999 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1000 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1001 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1003 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1004 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1005 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1007 /* Inner loop uses 278 flops */
1010 /* End of innermost loop */
/* Pass the twelve i-water force accumulators to the helper together with
 * this water's slot in f and this list's slot in fshift. */
1012 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1013 f+i_coord_offset,fshift+i_shift_offset);
1016 /* Update potential energies */
1017 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
1018 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
1020 /* Increment number of inner iterations */
1021 inneriter += j_index_end - j_index_start;
1023 /* Outer loop uses 26 flops */
1026 /* Increment number of outer iterations */
1029 /* Update outer/inner flops */
1031 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_VF,outeriter*26 + inneriter*278);
1034 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_double
1035 * Electrostatics interaction: Coulomb
1036 * VdW interaction: LennardJones
1037 * Geometry: Water4-Water4
1038 * Calculate force/pot: Force
/*
 * Force-only variant of the Coulomb + Lennard-Jones kernel for pairs of
 * four-site waters; four j waters are handled per 256-bit register.
 *
 * As written below, the Lennard-Jones term is evaluated for site pair 0-0
 * only (c6_00/c12_00), and plain Coulomb terms are evaluated for the nine
 * pairs between sites 1..3 of the i water and sites 1..3 of the j water
 * (qq11..qq33). No potential-energy accumulation is visible in this kernel.
 *
 * nlist        neighbour list; jindex, shift and iinr[0] are read from it
 * xx, ff       presumably the sources of x and f (coordinates / forces);
 *              the assignments themselves are not visible in this text
 * fr           supplies shift_vec, fshift, epsfac, ntype and nbfp
 * mdatoms      supplies chargeA and typeA
 * kernel_data  not referenced in the lines of this kernel shown here
 * nrnb         flop accounting, updated through inc_nrnb() at the end
 *
 * NOTE(review): several names are used without a visible declaration or
 * assignment in this text: charge, nvdwtype, vdwparam, vdwtype; x, f, nri,
 * jjnr; jnrA inside the unrolled loop; fscal after each fvdw/felec line;
 * outeriter and inneriter. Confirm against the complete file before
 * relying on this listing.
 */
1041 nb_kernel_ElecCoul_VdwLJ_GeomW4W4_F_avx_256_double
1042 (t_nblist * gmx_restrict nlist,
1043 rvec * gmx_restrict xx,
1044 rvec * gmx_restrict ff,
1045 t_forcerec * gmx_restrict fr,
1046 t_mdatoms * gmx_restrict mdatoms,
1047 nb_kernel_data_t * gmx_restrict kernel_data,
1048 t_nrnb * gmx_restrict nrnb)
1050 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
1051 * just 0 for non-waters.
1052 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
1053 * jnr indices corresponding to data put in the four positions in the SIMD register.
1055 int i_shift_offset,i_coord_offset,outeriter,inneriter;
1056 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
1057 int jnrA,jnrB,jnrC,jnrD;
1058 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
1059 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
1060 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
1061 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
1062 real rcutoff_scalar;
1063 real *shiftvec,*fshift,*x,*f;
1064 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
1065 real scratch[4*DIM];
1066 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
1067 real * vdwioffsetptr0;
1068 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
1069 real * vdwioffsetptr1;
1070 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
1071 real * vdwioffsetptr2;
1072 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
1073 real * vdwioffsetptr3;
1074 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
1075 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1076 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1077 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1078 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1079 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1080 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1081 int vdwjidx3A,vdwjidx3B,vdwjidx3C,vdwjidx3D;
1082 __m256d jx3,jy3,jz3,fjx3,fjy3,fjz3,jq3,isaj3;
1083 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1084 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1085 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1086 __m256d dx13,dy13,dz13,rsq13,rinv13,rinvsq13,r13,qq13,c6_13,c12_13;
1087 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1088 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1089 __m256d dx23,dy23,dz23,rsq23,rinv23,rinvsq23,r23,qq23,c6_23,c12_23;
1090 __m256d dx31,dy31,dz31,rsq31,rinv31,rinvsq31,r31,qq31,c6_31,c12_31;
1091 __m256d dx32,dy32,dz32,rsq32,rinv32,rinvsq32,r32,qq32,c6_32,c12_32;
1092 __m256d dx33,dy33,dz33,rsq33,rinv33,rinvsq33,r33,qq33,c6_33,c12_33;
1093 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1096 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1099 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1100 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1101 __m256d dummy_mask,cutoff_mask;
1102 __m128 tmpmask0,tmpmask1;
1103 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1104 __m256d one = _mm256_set1_pd(1.0);
1105 __m256d two = _mm256_set1_pd(2.0);
1111 jindex = nlist->jindex;
1113 shiftidx = nlist->shift;
1115 shiftvec = fr->shift_vec[0];
1116 fshift = fr->fshift[0];
1117 facel = _mm256_set1_pd(fr->epsfac);
1118 charge = mdatoms->chargeA;
1119 nvdwtype = fr->ntype;
1120 vdwparam = fr->nbfp;
1121 vdwtype = mdatoms->typeA;
1123 /* Setup water-specific parameters */
1124 inr = nlist->iinr[0];
1125 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1126 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1127 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
1128 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
/* The j-side charges and LJ type index are read from the same atoms
 * (inr+0..inr+3 of the first list entry) as the i-side ones, once, before
 * any loop; presumably every water in the list shares these parameters. */
1130 jq1 = _mm256_set1_pd(charge[inr+1]);
1131 jq2 = _mm256_set1_pd(charge[inr+2]);
1132 jq3 = _mm256_set1_pd(charge[inr+3]);
1133 vdwjidx0A = 2*vdwtype[inr+0];
1134 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1135 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
/* Charge products for the nine Coulomb site pairs; the i charges were
 * already multiplied by facel above, so qqXY carries that factor too. */
1136 qq11 = _mm256_mul_pd(iq1,jq1);
1137 qq12 = _mm256_mul_pd(iq1,jq2);
1138 qq13 = _mm256_mul_pd(iq1,jq3);
1139 qq21 = _mm256_mul_pd(iq2,jq1);
1140 qq22 = _mm256_mul_pd(iq2,jq2);
1141 qq23 = _mm256_mul_pd(iq2,jq3);
1142 qq31 = _mm256_mul_pd(iq3,jq1);
1143 qq32 = _mm256_mul_pd(iq3,jq2);
1144 qq33 = _mm256_mul_pd(iq3,jq3);
1146 /* Avoid stupid compiler warnings */
1147 jnrA = jnrB = jnrC = jnrD = 0;
1148 j_coord_offsetA = 0;
1149 j_coord_offsetB = 0;
1150 j_coord_offsetC = 0;
1151 j_coord_offsetD = 0;
/* Zero the scratch buffer; the masked remainder code further down points
 * the j force pointers of negative (dummy) list entries at it. */
1156 for(iidx=0;iidx<4*DIM;iidx++)
1158 scratch[iidx] = 0.0;
1161 /* Start outer loop over neighborlists */
1162 for(iidx=0; iidx<nri; iidx++)
1164 /* Load shift vector for this list */
1165 i_shift_offset = DIM*shiftidx[iidx];
1167 /* Load limits for loop over neighbors */
1168 j_index_start = jindex[iidx];
1169 j_index_end = jindex[iidx+1];
1171 /* Get outer coordinate index */
1173 i_coord_offset = DIM*inr;
1175 /* Load i particle coords and add shift vector */
1176 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1177 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
1179 fix0 = _mm256_setzero_pd();
1180 fiy0 = _mm256_setzero_pd();
1181 fiz0 = _mm256_setzero_pd();
1182 fix1 = _mm256_setzero_pd();
1183 fiy1 = _mm256_setzero_pd();
1184 fiz1 = _mm256_setzero_pd();
1185 fix2 = _mm256_setzero_pd();
1186 fiy2 = _mm256_setzero_pd();
1187 fiz2 = _mm256_setzero_pd();
1188 fix3 = _mm256_setzero_pd();
1189 fiy3 = _mm256_setzero_pd();
1190 fiz3 = _mm256_setzero_pd();
1192 /* Start inner kernel loop */
/* Steps through the j list four entries at a time and continues only
 * while the fourth entry of the current quadruple is non-negative; a
 * quadruple whose last entry is negative is left to the masked remainder
 * code after this loop. */
1193 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1196 /* Get j neighbor index, and coordinate index */
1198 jnrB = jjnr[jidx+1];
1199 jnrC = jjnr[jidx+2];
1200 jnrD = jjnr[jidx+3];
1201 j_coord_offsetA = DIM*jnrA;
1202 j_coord_offsetB = DIM*jnrB;
1203 j_coord_offsetC = DIM*jnrC;
1204 j_coord_offsetD = DIM*jnrD;
1206 /* load j atom coordinates */
1207 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1208 x+j_coord_offsetC,x+j_coord_offsetD,
1209 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1210 &jy2,&jz2,&jx3,&jy3,&jz3);
1212 /* Calculate displacement vector */
1213 dx00 = _mm256_sub_pd(ix0,jx0);
1214 dy00 = _mm256_sub_pd(iy0,jy0);
1215 dz00 = _mm256_sub_pd(iz0,jz0);
1216 dx11 = _mm256_sub_pd(ix1,jx1);
1217 dy11 = _mm256_sub_pd(iy1,jy1);
1218 dz11 = _mm256_sub_pd(iz1,jz1);
1219 dx12 = _mm256_sub_pd(ix1,jx2);
1220 dy12 = _mm256_sub_pd(iy1,jy2);
1221 dz12 = _mm256_sub_pd(iz1,jz2);
1222 dx13 = _mm256_sub_pd(ix1,jx3);
1223 dy13 = _mm256_sub_pd(iy1,jy3);
1224 dz13 = _mm256_sub_pd(iz1,jz3);
1225 dx21 = _mm256_sub_pd(ix2,jx1);
1226 dy21 = _mm256_sub_pd(iy2,jy1);
1227 dz21 = _mm256_sub_pd(iz2,jz1);
1228 dx22 = _mm256_sub_pd(ix2,jx2);
1229 dy22 = _mm256_sub_pd(iy2,jy2);
1230 dz22 = _mm256_sub_pd(iz2,jz2);
1231 dx23 = _mm256_sub_pd(ix2,jx3);
1232 dy23 = _mm256_sub_pd(iy2,jy3);
1233 dz23 = _mm256_sub_pd(iz2,jz3);
1234 dx31 = _mm256_sub_pd(ix3,jx1);
1235 dy31 = _mm256_sub_pd(iy3,jy1);
1236 dz31 = _mm256_sub_pd(iz3,jz1);
1237 dx32 = _mm256_sub_pd(ix3,jx2);
1238 dy32 = _mm256_sub_pd(iy3,jy2);
1239 dz32 = _mm256_sub_pd(iz3,jz2);
1240 dx33 = _mm256_sub_pd(ix3,jx3);
1241 dy33 = _mm256_sub_pd(iy3,jy3);
1242 dz33 = _mm256_sub_pd(iz3,jz3);
1244 /* Calculate squared distance and things based on it */
1245 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1246 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1247 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1248 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1249 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1250 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1251 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1252 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1253 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1254 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1256 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1257 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1258 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1259 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1260 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1261 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1262 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1263 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1264 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
/* Pair 0-0 feeds only the Lennard-Jones term below, which uses rinvsq00
 * and never rinv00, so rinvsq00 is taken from gmx_mm256_inv_pd(rsq00)
 * rather than by squaring an inverse square root. */
1266 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1267 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1268 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1269 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1270 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1271 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1272 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1273 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1274 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1275 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1277 fjx0 = _mm256_setzero_pd();
1278 fjy0 = _mm256_setzero_pd();
1279 fjz0 = _mm256_setzero_pd();
1280 fjx1 = _mm256_setzero_pd();
1281 fjy1 = _mm256_setzero_pd();
1282 fjz1 = _mm256_setzero_pd();
1283 fjx2 = _mm256_setzero_pd();
1284 fjy2 = _mm256_setzero_pd();
1285 fjz2 = _mm256_setzero_pd();
1286 fjx3 = _mm256_setzero_pd();
1287 fjy3 = _mm256_setzero_pd();
1288 fjz3 = _mm256_setzero_pd();
1290 /**************************
1291 * CALCULATE INTERACTIONS *
1292 **************************/
1294 /* LENNARD-JONES DISPERSION/REPULSION */
1296 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
/* fvdw = (c12_00*rinvsix - c6_00) * rinvsix * rinvsq00 */
1297 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1301 /* Calculate temporary vectorial force */
1302 tx = _mm256_mul_pd(fscal,dx00);
1303 ty = _mm256_mul_pd(fscal,dy00);
1304 tz = _mm256_mul_pd(fscal,dz00);
1306 /* Update vectorial force */
1307 fix0 = _mm256_add_pd(fix0,tx);
1308 fiy0 = _mm256_add_pd(fiy0,ty);
1309 fiz0 = _mm256_add_pd(fiz0,tz);
1311 fjx0 = _mm256_add_pd(fjx0,tx);
1312 fjy0 = _mm256_add_pd(fjy0,ty);
1313 fjz0 = _mm256_add_pd(fjz0,tz);
1315 /**************************
1316 * CALCULATE INTERACTIONS *
1317 **************************/
1319 /* COULOMB ELECTROSTATICS */
1320 velec = _mm256_mul_pd(qq11,rinv11);
1321 felec = _mm256_mul_pd(velec,rinvsq11);
1325 /* Calculate temporary vectorial force */
1326 tx = _mm256_mul_pd(fscal,dx11);
1327 ty = _mm256_mul_pd(fscal,dy11);
1328 tz = _mm256_mul_pd(fscal,dz11);
1330 /* Update vectorial force */
1331 fix1 = _mm256_add_pd(fix1,tx);
1332 fiy1 = _mm256_add_pd(fiy1,ty);
1333 fiz1 = _mm256_add_pd(fiz1,tz);
1335 fjx1 = _mm256_add_pd(fjx1,tx);
1336 fjy1 = _mm256_add_pd(fjy1,ty);
1337 fjz1 = _mm256_add_pd(fjz1,tz);
1339 /**************************
1340 * CALCULATE INTERACTIONS *
1341 **************************/
1343 /* COULOMB ELECTROSTATICS */
1344 velec = _mm256_mul_pd(qq12,rinv12);
1345 felec = _mm256_mul_pd(velec,rinvsq12);
1349 /* Calculate temporary vectorial force */
1350 tx = _mm256_mul_pd(fscal,dx12);
1351 ty = _mm256_mul_pd(fscal,dy12);
1352 tz = _mm256_mul_pd(fscal,dz12);
1354 /* Update vectorial force */
1355 fix1 = _mm256_add_pd(fix1,tx);
1356 fiy1 = _mm256_add_pd(fiy1,ty);
1357 fiz1 = _mm256_add_pd(fiz1,tz);
1359 fjx2 = _mm256_add_pd(fjx2,tx);
1360 fjy2 = _mm256_add_pd(fjy2,ty);
1361 fjz2 = _mm256_add_pd(fjz2,tz);
1363 /**************************
1364 * CALCULATE INTERACTIONS *
1365 **************************/
1367 /* COULOMB ELECTROSTATICS */
1368 velec = _mm256_mul_pd(qq13,rinv13);
1369 felec = _mm256_mul_pd(velec,rinvsq13);
1373 /* Calculate temporary vectorial force */
1374 tx = _mm256_mul_pd(fscal,dx13);
1375 ty = _mm256_mul_pd(fscal,dy13);
1376 tz = _mm256_mul_pd(fscal,dz13);
1378 /* Update vectorial force */
1379 fix1 = _mm256_add_pd(fix1,tx);
1380 fiy1 = _mm256_add_pd(fiy1,ty);
1381 fiz1 = _mm256_add_pd(fiz1,tz);
1383 fjx3 = _mm256_add_pd(fjx3,tx);
1384 fjy3 = _mm256_add_pd(fjy3,ty);
1385 fjz3 = _mm256_add_pd(fjz3,tz);
1387 /**************************
1388 * CALCULATE INTERACTIONS *
1389 **************************/
1391 /* COULOMB ELECTROSTATICS */
1392 velec = _mm256_mul_pd(qq21,rinv21);
1393 felec = _mm256_mul_pd(velec,rinvsq21);
1397 /* Calculate temporary vectorial force */
1398 tx = _mm256_mul_pd(fscal,dx21);
1399 ty = _mm256_mul_pd(fscal,dy21);
1400 tz = _mm256_mul_pd(fscal,dz21);
1402 /* Update vectorial force */
1403 fix2 = _mm256_add_pd(fix2,tx);
1404 fiy2 = _mm256_add_pd(fiy2,ty);
1405 fiz2 = _mm256_add_pd(fiz2,tz);
1407 fjx1 = _mm256_add_pd(fjx1,tx);
1408 fjy1 = _mm256_add_pd(fjy1,ty);
1409 fjz1 = _mm256_add_pd(fjz1,tz);
1411 /**************************
1412 * CALCULATE INTERACTIONS *
1413 **************************/
1415 /* COULOMB ELECTROSTATICS */
1416 velec = _mm256_mul_pd(qq22,rinv22);
1417 felec = _mm256_mul_pd(velec,rinvsq22);
1421 /* Calculate temporary vectorial force */
1422 tx = _mm256_mul_pd(fscal,dx22);
1423 ty = _mm256_mul_pd(fscal,dy22);
1424 tz = _mm256_mul_pd(fscal,dz22);
1426 /* Update vectorial force */
1427 fix2 = _mm256_add_pd(fix2,tx);
1428 fiy2 = _mm256_add_pd(fiy2,ty);
1429 fiz2 = _mm256_add_pd(fiz2,tz);
1431 fjx2 = _mm256_add_pd(fjx2,tx);
1432 fjy2 = _mm256_add_pd(fjy2,ty);
1433 fjz2 = _mm256_add_pd(fjz2,tz);
1435 /**************************
1436 * CALCULATE INTERACTIONS *
1437 **************************/
1439 /* COULOMB ELECTROSTATICS */
1440 velec = _mm256_mul_pd(qq23,rinv23);
1441 felec = _mm256_mul_pd(velec,rinvsq23);
1445 /* Calculate temporary vectorial force */
1446 tx = _mm256_mul_pd(fscal,dx23);
1447 ty = _mm256_mul_pd(fscal,dy23);
1448 tz = _mm256_mul_pd(fscal,dz23);
1450 /* Update vectorial force */
1451 fix2 = _mm256_add_pd(fix2,tx);
1452 fiy2 = _mm256_add_pd(fiy2,ty);
1453 fiz2 = _mm256_add_pd(fiz2,tz);
1455 fjx3 = _mm256_add_pd(fjx3,tx);
1456 fjy3 = _mm256_add_pd(fjy3,ty);
1457 fjz3 = _mm256_add_pd(fjz3,tz);
1459 /**************************
1460 * CALCULATE INTERACTIONS *
1461 **************************/
1463 /* COULOMB ELECTROSTATICS */
1464 velec = _mm256_mul_pd(qq31,rinv31);
1465 felec = _mm256_mul_pd(velec,rinvsq31);
1469 /* Calculate temporary vectorial force */
1470 tx = _mm256_mul_pd(fscal,dx31);
1471 ty = _mm256_mul_pd(fscal,dy31);
1472 tz = _mm256_mul_pd(fscal,dz31);
1474 /* Update vectorial force */
1475 fix3 = _mm256_add_pd(fix3,tx);
1476 fiy3 = _mm256_add_pd(fiy3,ty);
1477 fiz3 = _mm256_add_pd(fiz3,tz);
1479 fjx1 = _mm256_add_pd(fjx1,tx);
1480 fjy1 = _mm256_add_pd(fjy1,ty);
1481 fjz1 = _mm256_add_pd(fjz1,tz);
1483 /**************************
1484 * CALCULATE INTERACTIONS *
1485 **************************/
1487 /* COULOMB ELECTROSTATICS */
1488 velec = _mm256_mul_pd(qq32,rinv32);
1489 felec = _mm256_mul_pd(velec,rinvsq32);
1493 /* Calculate temporary vectorial force */
1494 tx = _mm256_mul_pd(fscal,dx32);
1495 ty = _mm256_mul_pd(fscal,dy32);
1496 tz = _mm256_mul_pd(fscal,dz32);
1498 /* Update vectorial force */
1499 fix3 = _mm256_add_pd(fix3,tx);
1500 fiy3 = _mm256_add_pd(fiy3,ty);
1501 fiz3 = _mm256_add_pd(fiz3,tz);
1503 fjx2 = _mm256_add_pd(fjx2,tx);
1504 fjy2 = _mm256_add_pd(fjy2,ty);
1505 fjz2 = _mm256_add_pd(fjz2,tz);
1507 /**************************
1508 * CALCULATE INTERACTIONS *
1509 **************************/
1511 /* COULOMB ELECTROSTATICS */
1512 velec = _mm256_mul_pd(qq33,rinv33);
1513 felec = _mm256_mul_pd(velec,rinvsq33);
1517 /* Calculate temporary vectorial force */
1518 tx = _mm256_mul_pd(fscal,dx33);
1519 ty = _mm256_mul_pd(fscal,dy33);
1520 tz = _mm256_mul_pd(fscal,dz33);
1522 /* Update vectorial force */
1523 fix3 = _mm256_add_pd(fix3,tx);
1524 fiy3 = _mm256_add_pd(fiy3,ty);
1525 fiz3 = _mm256_add_pd(fiz3,tz);
1527 fjx3 = _mm256_add_pd(fjx3,tx);
1528 fjy3 = _mm256_add_pd(fjy3,ty);
1529 fjz3 = _mm256_add_pd(fjz3,tz);
/* No masking in the unrolled loop: the j force pointers always point
 * into f. */
1531 fjptrA = f+j_coord_offsetA;
1532 fjptrB = f+j_coord_offsetB;
1533 fjptrC = f+j_coord_offsetC;
1534 fjptrD = f+j_coord_offsetD;
1536 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1537 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1538 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1540 /* Inner loop uses 264 flops */
/* Remainder: one more quadruple is processed when the loop above stopped
 * before j_index_end. Its negative entries are clamped to index 0 for the
 * coordinate load, their force scalars are cleared through dummy_mask and
 * their j forces are sent to scratch. dummy_mask is built from a 32-bit
 * "less than zero" compare of the four jjnr entries; the two permutes
 * appear to duplicate each 32-bit result so it covers a 64-bit lane. */
1543 if(jidx<j_index_end)
1546 /* Get j neighbor index, and coordinate index */
1547 jnrlistA = jjnr[jidx];
1548 jnrlistB = jjnr[jidx+1];
1549 jnrlistC = jjnr[jidx+2];
1550 jnrlistD = jjnr[jidx+3];
1551 /* Sign of each element will be negative for non-real atoms.
1552 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1553 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
1555 tmpmask0 = gmx_mm_castsi128_pd(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1557 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1558 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1559 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1561 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1562 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1563 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1564 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1565 j_coord_offsetA = DIM*jnrA;
1566 j_coord_offsetB = DIM*jnrB;
1567 j_coord_offsetC = DIM*jnrC;
1568 j_coord_offsetD = DIM*jnrD;
1570 /* load j atom coordinates */
1571 gmx_mm256_load_4rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1572 x+j_coord_offsetC,x+j_coord_offsetD,
1573 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,
1574 &jy2,&jz2,&jx3,&jy3,&jz3);
1576 /* Calculate displacement vector */
1577 dx00 = _mm256_sub_pd(ix0,jx0);
1578 dy00 = _mm256_sub_pd(iy0,jy0);
1579 dz00 = _mm256_sub_pd(iz0,jz0);
1580 dx11 = _mm256_sub_pd(ix1,jx1);
1581 dy11 = _mm256_sub_pd(iy1,jy1);
1582 dz11 = _mm256_sub_pd(iz1,jz1);
1583 dx12 = _mm256_sub_pd(ix1,jx2);
1584 dy12 = _mm256_sub_pd(iy1,jy2);
1585 dz12 = _mm256_sub_pd(iz1,jz2);
1586 dx13 = _mm256_sub_pd(ix1,jx3);
1587 dy13 = _mm256_sub_pd(iy1,jy3);
1588 dz13 = _mm256_sub_pd(iz1,jz3);
1589 dx21 = _mm256_sub_pd(ix2,jx1);
1590 dy21 = _mm256_sub_pd(iy2,jy1);
1591 dz21 = _mm256_sub_pd(iz2,jz1);
1592 dx22 = _mm256_sub_pd(ix2,jx2);
1593 dy22 = _mm256_sub_pd(iy2,jy2);
1594 dz22 = _mm256_sub_pd(iz2,jz2);
1595 dx23 = _mm256_sub_pd(ix2,jx3);
1596 dy23 = _mm256_sub_pd(iy2,jy3);
1597 dz23 = _mm256_sub_pd(iz2,jz3);
1598 dx31 = _mm256_sub_pd(ix3,jx1);
1599 dy31 = _mm256_sub_pd(iy3,jy1);
1600 dz31 = _mm256_sub_pd(iz3,jz1);
1601 dx32 = _mm256_sub_pd(ix3,jx2);
1602 dy32 = _mm256_sub_pd(iy3,jy2);
1603 dz32 = _mm256_sub_pd(iz3,jz2);
1604 dx33 = _mm256_sub_pd(ix3,jx3);
1605 dy33 = _mm256_sub_pd(iy3,jy3);
1606 dz33 = _mm256_sub_pd(iz3,jz3);
1608 /* Calculate squared distance and things based on it */
1609 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1610 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1611 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1612 rsq13 = gmx_mm256_calc_rsq_pd(dx13,dy13,dz13);
1613 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1614 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1615 rsq23 = gmx_mm256_calc_rsq_pd(dx23,dy23,dz23);
1616 rsq31 = gmx_mm256_calc_rsq_pd(dx31,dy31,dz31);
1617 rsq32 = gmx_mm256_calc_rsq_pd(dx32,dy32,dz32);
1618 rsq33 = gmx_mm256_calc_rsq_pd(dx33,dy33,dz33);
1620 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1621 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1622 rinv13 = gmx_mm256_invsqrt_pd(rsq13);
1623 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1624 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1625 rinv23 = gmx_mm256_invsqrt_pd(rsq23);
1626 rinv31 = gmx_mm256_invsqrt_pd(rsq31);
1627 rinv32 = gmx_mm256_invsqrt_pd(rsq32);
1628 rinv33 = gmx_mm256_invsqrt_pd(rsq33);
1630 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1631 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1632 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1633 rinvsq13 = _mm256_mul_pd(rinv13,rinv13);
1634 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1635 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1636 rinvsq23 = _mm256_mul_pd(rinv23,rinv23);
1637 rinvsq31 = _mm256_mul_pd(rinv31,rinv31);
1638 rinvsq32 = _mm256_mul_pd(rinv32,rinv32);
1639 rinvsq33 = _mm256_mul_pd(rinv33,rinv33);
1641 fjx0 = _mm256_setzero_pd();
1642 fjy0 = _mm256_setzero_pd();
1643 fjz0 = _mm256_setzero_pd();
1644 fjx1 = _mm256_setzero_pd();
1645 fjy1 = _mm256_setzero_pd();
1646 fjz1 = _mm256_setzero_pd();
1647 fjx2 = _mm256_setzero_pd();
1648 fjy2 = _mm256_setzero_pd();
1649 fjz2 = _mm256_setzero_pd();
1650 fjx3 = _mm256_setzero_pd();
1651 fjy3 = _mm256_setzero_pd();
1652 fjz3 = _mm256_setzero_pd();
1654 /**************************
1655 * CALCULATE INTERACTIONS *
1656 **************************/
1658 /* LENNARD-JONES DISPERSION/REPULSION */
1660 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1661 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
/* Clear the force scalar in lanes that hold dummy entries; the same
 * masking is repeated for every site pair below. */
1665 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1667 /* Calculate temporary vectorial force */
1668 tx = _mm256_mul_pd(fscal,dx00);
1669 ty = _mm256_mul_pd(fscal,dy00);
1670 tz = _mm256_mul_pd(fscal,dz00);
1672 /* Update vectorial force */
1673 fix0 = _mm256_add_pd(fix0,tx);
1674 fiy0 = _mm256_add_pd(fiy0,ty);
1675 fiz0 = _mm256_add_pd(fiz0,tz);
1677 fjx0 = _mm256_add_pd(fjx0,tx);
1678 fjy0 = _mm256_add_pd(fjy0,ty);
1679 fjz0 = _mm256_add_pd(fjz0,tz);
1681 /**************************
1682 * CALCULATE INTERACTIONS *
1683 **************************/
1685 /* COULOMB ELECTROSTATICS */
1686 velec = _mm256_mul_pd(qq11,rinv11);
1687 felec = _mm256_mul_pd(velec,rinvsq11);
1691 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1693 /* Calculate temporary vectorial force */
1694 tx = _mm256_mul_pd(fscal,dx11);
1695 ty = _mm256_mul_pd(fscal,dy11);
1696 tz = _mm256_mul_pd(fscal,dz11);
1698 /* Update vectorial force */
1699 fix1 = _mm256_add_pd(fix1,tx);
1700 fiy1 = _mm256_add_pd(fiy1,ty);
1701 fiz1 = _mm256_add_pd(fiz1,tz);
1703 fjx1 = _mm256_add_pd(fjx1,tx);
1704 fjy1 = _mm256_add_pd(fjy1,ty);
1705 fjz1 = _mm256_add_pd(fjz1,tz);
1707 /**************************
1708 * CALCULATE INTERACTIONS *
1709 **************************/
1711 /* COULOMB ELECTROSTATICS */
1712 velec = _mm256_mul_pd(qq12,rinv12);
1713 felec = _mm256_mul_pd(velec,rinvsq12);
1717 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1719 /* Calculate temporary vectorial force */
1720 tx = _mm256_mul_pd(fscal,dx12);
1721 ty = _mm256_mul_pd(fscal,dy12);
1722 tz = _mm256_mul_pd(fscal,dz12);
1724 /* Update vectorial force */
1725 fix1 = _mm256_add_pd(fix1,tx);
1726 fiy1 = _mm256_add_pd(fiy1,ty);
1727 fiz1 = _mm256_add_pd(fiz1,tz);
1729 fjx2 = _mm256_add_pd(fjx2,tx);
1730 fjy2 = _mm256_add_pd(fjy2,ty);
1731 fjz2 = _mm256_add_pd(fjz2,tz);
1733 /**************************
1734 * CALCULATE INTERACTIONS *
1735 **************************/
1737 /* COULOMB ELECTROSTATICS */
1738 velec = _mm256_mul_pd(qq13,rinv13);
1739 felec = _mm256_mul_pd(velec,rinvsq13);
1743 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1745 /* Calculate temporary vectorial force */
1746 tx = _mm256_mul_pd(fscal,dx13);
1747 ty = _mm256_mul_pd(fscal,dy13);
1748 tz = _mm256_mul_pd(fscal,dz13);
1750 /* Update vectorial force */
1751 fix1 = _mm256_add_pd(fix1,tx);
1752 fiy1 = _mm256_add_pd(fiy1,ty);
1753 fiz1 = _mm256_add_pd(fiz1,tz);
1755 fjx3 = _mm256_add_pd(fjx3,tx);
1756 fjy3 = _mm256_add_pd(fjy3,ty);
1757 fjz3 = _mm256_add_pd(fjz3,tz);
1759 /**************************
1760 * CALCULATE INTERACTIONS *
1761 **************************/
1763 /* COULOMB ELECTROSTATICS */
1764 velec = _mm256_mul_pd(qq21,rinv21);
1765 felec = _mm256_mul_pd(velec,rinvsq21);
1769 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1771 /* Calculate temporary vectorial force */
1772 tx = _mm256_mul_pd(fscal,dx21);
1773 ty = _mm256_mul_pd(fscal,dy21);
1774 tz = _mm256_mul_pd(fscal,dz21);
1776 /* Update vectorial force */
1777 fix2 = _mm256_add_pd(fix2,tx);
1778 fiy2 = _mm256_add_pd(fiy2,ty);
1779 fiz2 = _mm256_add_pd(fiz2,tz);
1781 fjx1 = _mm256_add_pd(fjx1,tx);
1782 fjy1 = _mm256_add_pd(fjy1,ty);
1783 fjz1 = _mm256_add_pd(fjz1,tz);
1785 /**************************
1786 * CALCULATE INTERACTIONS *
1787 **************************/
1789 /* COULOMB ELECTROSTATICS */
1790 velec = _mm256_mul_pd(qq22,rinv22);
1791 felec = _mm256_mul_pd(velec,rinvsq22);
1795 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1797 /* Calculate temporary vectorial force */
1798 tx = _mm256_mul_pd(fscal,dx22);
1799 ty = _mm256_mul_pd(fscal,dy22);
1800 tz = _mm256_mul_pd(fscal,dz22);
1802 /* Update vectorial force */
1803 fix2 = _mm256_add_pd(fix2,tx);
1804 fiy2 = _mm256_add_pd(fiy2,ty);
1805 fiz2 = _mm256_add_pd(fiz2,tz);
1807 fjx2 = _mm256_add_pd(fjx2,tx);
1808 fjy2 = _mm256_add_pd(fjy2,ty);
1809 fjz2 = _mm256_add_pd(fjz2,tz);
1811 /**************************
1812 * CALCULATE INTERACTIONS *
1813 **************************/
1815 /* COULOMB ELECTROSTATICS */
1816 velec = _mm256_mul_pd(qq23,rinv23);
1817 felec = _mm256_mul_pd(velec,rinvsq23);
1821 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1823 /* Calculate temporary vectorial force */
1824 tx = _mm256_mul_pd(fscal,dx23);
1825 ty = _mm256_mul_pd(fscal,dy23);
1826 tz = _mm256_mul_pd(fscal,dz23);
1828 /* Update vectorial force */
1829 fix2 = _mm256_add_pd(fix2,tx);
1830 fiy2 = _mm256_add_pd(fiy2,ty);
1831 fiz2 = _mm256_add_pd(fiz2,tz);
1833 fjx3 = _mm256_add_pd(fjx3,tx);
1834 fjy3 = _mm256_add_pd(fjy3,ty);
1835 fjz3 = _mm256_add_pd(fjz3,tz);
1837 /**************************
1838 * CALCULATE INTERACTIONS *
1839 **************************/
1841 /* COULOMB ELECTROSTATICS */
1842 velec = _mm256_mul_pd(qq31,rinv31);
1843 felec = _mm256_mul_pd(velec,rinvsq31);
1847 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1849 /* Calculate temporary vectorial force */
1850 tx = _mm256_mul_pd(fscal,dx31);
1851 ty = _mm256_mul_pd(fscal,dy31);
1852 tz = _mm256_mul_pd(fscal,dz31);
1854 /* Update vectorial force */
1855 fix3 = _mm256_add_pd(fix3,tx);
1856 fiy3 = _mm256_add_pd(fiy3,ty);
1857 fiz3 = _mm256_add_pd(fiz3,tz);
1859 fjx1 = _mm256_add_pd(fjx1,tx);
1860 fjy1 = _mm256_add_pd(fjy1,ty);
1861 fjz1 = _mm256_add_pd(fjz1,tz);
1863 /**************************
1864 * CALCULATE INTERACTIONS *
1865 **************************/
1867 /* COULOMB ELECTROSTATICS */
1868 velec = _mm256_mul_pd(qq32,rinv32);
1869 felec = _mm256_mul_pd(velec,rinvsq32);
1873 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1875 /* Calculate temporary vectorial force */
1876 tx = _mm256_mul_pd(fscal,dx32);
1877 ty = _mm256_mul_pd(fscal,dy32);
1878 tz = _mm256_mul_pd(fscal,dz32);
1880 /* Update vectorial force */
1881 fix3 = _mm256_add_pd(fix3,tx);
1882 fiy3 = _mm256_add_pd(fiy3,ty);
1883 fiz3 = _mm256_add_pd(fiz3,tz);
1885 fjx2 = _mm256_add_pd(fjx2,tx);
1886 fjy2 = _mm256_add_pd(fjy2,ty);
1887 fjz2 = _mm256_add_pd(fjz2,tz);
1889 /**************************
1890 * CALCULATE INTERACTIONS *
1891 **************************/
1893 /* COULOMB ELECTROSTATICS */
1894 velec = _mm256_mul_pd(qq33,rinv33);
1895 felec = _mm256_mul_pd(velec,rinvsq33);
1899 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1901 /* Calculate temporary vectorial force */
1902 tx = _mm256_mul_pd(fscal,dx33);
1903 ty = _mm256_mul_pd(fscal,dy33);
1904 tz = _mm256_mul_pd(fscal,dz33);
1906 /* Update vectorial force */
1907 fix3 = _mm256_add_pd(fix3,tx);
1908 fiy3 = _mm256_add_pd(fiy3,ty);
1909 fiz3 = _mm256_add_pd(fiz3,tz);
1911 fjx3 = _mm256_add_pd(fjx3,tx);
1912 fjy3 = _mm256_add_pd(fjy3,ty);
1913 fjz3 = _mm256_add_pd(fjz3,tz);
/* Entries with a negative list index get scratch as their force
 * destination, so the helper call below does not touch f for them. */
1915 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1916 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1917 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1918 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1920 gmx_mm256_decrement_4rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1921 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,
1922 fjx2,fjy2,fjz2,fjx3,fjy3,fjz3);
1924 /* Inner loop uses 264 flops */
1927 /* End of innermost loop */
/* Pass the accumulated i-water force registers to the helper together
 * with this list's slots in f and fshift; the reduction and the update
 * presumably happen inside the helper. */
1929 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1930 f+i_coord_offset,fshift+i_shift_offset);
1932 /* Increment number of inner iterations */
1933 inneriter += j_index_end - j_index_start;
1935 /* Outer loop uses 24 flops */
1938 /* Increment number of outer iterations */
1941 /* Update outer/inner flops */
1943 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4W4_F,outeriter*24 + inneriter*264);