2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_double
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Water3
41 * Calculate force/pot: PotentialAndForce
/*
 * Potential-and-force kernel: plain Coulomb plus Lennard-Jones between
 * three-site molecules, written with 256-bit AVX double intrinsics so that
 * four j molecules (lanes A,B,C,D) are processed per inner iteration.
 *
 * What the visible code does:
 *  - All nine i/j atom pairs (00..22) get a Coulomb term velec = qq*rinv and
 *    felec = velec*rinvsq.
 *  - Only pair 0-0 gets a Lennard-Jones term, using c6_00/c12_00.
 *  - i charges are pre-multiplied by facel (set from fr->epsfac).
 *  - Summed energies are handed to gmx_mm256_update_1pot_pd with
 *    kernel_data->energygrp_elec+ggid and kernel_data->energygrp_vdw+ggid.
 *  - j forces go through gmx_mm256_decrement_3rvec_4ptr_swizzle_pd; i forces
 *    go through gmx_mm256_update_iforce_3atom_swizzle_pd together with
 *    f+i_coord_offset and fshift+i_shift_offset.
 *  - Flops are reported via inc_nrnb as outeriter*20 + inneriter*255.
 *
 * Parameters as used in the visible code:
 *  nlist        neighbour list; jindex, shift and iinr[0] are read here.
 *  xx, ff       not referenced directly in the visible text; presumably the
 *               sources of the local x and f pointers - TODO confirm.
 *  fr           shift_vec, fshift, epsfac and ntype are read.
 *  mdatoms      chargeA and typeA are read.
 *  kernel_data  energygrp_elec and energygrp_vdw receive the energy sums.
 *  nrnb         passed to inc_nrnb for flop accounting.
 *
 * NOTE(review): this copy appears to be missing lines. There is no visible
 * assignment to nri, jjnr, x, f, scratch, vdwparam, ggid, outeriter or
 * inneriter, and no visible braces. Confirm every remark below against the
 * pristine generator output before relying on it.
 */
44 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
77 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
78 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
79 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
80 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
81 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
82 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
83 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
84 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
85 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
86 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
87 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
88 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
89 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
90 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
91 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
94 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
97 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
98 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
99 __m256d dummy_mask,cutoff_mask;
100 __m128 tmpmask0,tmpmask1;
101 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
102 __m256d one = _mm256_set1_pd(1.0);
103 __m256d two = _mm256_set1_pd(2.0);
109 jindex = nlist->jindex;
111 shiftidx = nlist->shift;
113 shiftvec = fr->shift_vec[0];
114 fshift = fr->fshift[0];
115 facel = _mm256_set1_pd(fr->epsfac);
116 charge = mdatoms->chargeA;
117 nvdwtype = fr->ntype;
119 vdwtype = mdatoms->typeA;
121 /* Setup water-specific parameters */
122 inr = nlist->iinr[0];
123 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
124 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
125 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
126 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
/* The j charges and the j LJ type index are read from the same three atoms
 * (inr+0..inr+2) as the i parameters, so every pair product below is computed
 * once, outside the loops. Presumably this relies on all molecules in the list
 * sharing one topology - TODO confirm. */
128 jq0 = _mm256_set1_pd(charge[inr+0]);
129 jq1 = _mm256_set1_pd(charge[inr+1]);
130 jq2 = _mm256_set1_pd(charge[inr+2]);
131 vdwjidx0A = 2*vdwtype[inr+0];
132 qq00 = _mm256_mul_pd(iq0,jq0);
133 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
134 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
135 qq01 = _mm256_mul_pd(iq0,jq1);
136 qq02 = _mm256_mul_pd(iq0,jq2);
137 qq10 = _mm256_mul_pd(iq1,jq0);
138 qq11 = _mm256_mul_pd(iq1,jq1);
139 qq12 = _mm256_mul_pd(iq1,jq2);
140 qq20 = _mm256_mul_pd(iq2,jq0);
141 qq21 = _mm256_mul_pd(iq2,jq1);
142 qq22 = _mm256_mul_pd(iq2,jq2);
144 /* Avoid stupid compiler warnings */
145 jnrA = jnrB = jnrC = jnrD = 0;
154 for(iidx=0;iidx<4*DIM;iidx++)
159 /* Start outer loop over neighborlists */
160 for(iidx=0; iidx<nri; iidx++)
162 /* Load shift vector for this list */
163 i_shift_offset = DIM*shiftidx[iidx];
165 /* Load limits for loop over neighbors */
166 j_index_start = jindex[iidx];
167 j_index_end = jindex[iidx+1];
169 /* Get outer coordinate index */
171 i_coord_offset = DIM*inr;
173 /* Load i particle coords and add shift vector */
174 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
175 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Clear the nine i-force accumulators for this list entry */
177 fix0 = _mm256_setzero_pd();
178 fiy0 = _mm256_setzero_pd();
179 fiz0 = _mm256_setzero_pd();
180 fix1 = _mm256_setzero_pd();
181 fiy1 = _mm256_setzero_pd();
182 fiz1 = _mm256_setzero_pd();
183 fix2 = _mm256_setzero_pd();
184 fiy2 = _mm256_setzero_pd();
185 fiz2 = _mm256_setzero_pd();
187 /* Reset potential sums */
188 velecsum = _mm256_setzero_pd();
189 vvdwsum = _mm256_setzero_pd();
191 /* Start inner kernel loop */
/* Steps four list entries at a time and keeps going only while the fourth
 * entry, jjnr[jidx+3], is non-negative. */
192 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
195 /* Get j neighbor index, and coordinate index */
200 j_coord_offsetA = DIM*jnrA;
201 j_coord_offsetB = DIM*jnrB;
202 j_coord_offsetC = DIM*jnrC;
203 j_coord_offsetD = DIM*jnrD;
205 /* load j atom coordinates */
206 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
207 x+j_coord_offsetC,x+j_coord_offsetD,
208 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
210 /* Calculate displacement vector */
211 dx00 = _mm256_sub_pd(ix0,jx0);
212 dy00 = _mm256_sub_pd(iy0,jy0);
213 dz00 = _mm256_sub_pd(iz0,jz0);
214 dx01 = _mm256_sub_pd(ix0,jx1);
215 dy01 = _mm256_sub_pd(iy0,jy1);
216 dz01 = _mm256_sub_pd(iz0,jz1);
217 dx02 = _mm256_sub_pd(ix0,jx2);
218 dy02 = _mm256_sub_pd(iy0,jy2);
219 dz02 = _mm256_sub_pd(iz0,jz2);
220 dx10 = _mm256_sub_pd(ix1,jx0);
221 dy10 = _mm256_sub_pd(iy1,jy0);
222 dz10 = _mm256_sub_pd(iz1,jz0);
223 dx11 = _mm256_sub_pd(ix1,jx1);
224 dy11 = _mm256_sub_pd(iy1,jy1);
225 dz11 = _mm256_sub_pd(iz1,jz1);
226 dx12 = _mm256_sub_pd(ix1,jx2);
227 dy12 = _mm256_sub_pd(iy1,jy2);
228 dz12 = _mm256_sub_pd(iz1,jz2);
229 dx20 = _mm256_sub_pd(ix2,jx0);
230 dy20 = _mm256_sub_pd(iy2,jy0);
231 dz20 = _mm256_sub_pd(iz2,jz0);
232 dx21 = _mm256_sub_pd(ix2,jx1);
233 dy21 = _mm256_sub_pd(iy2,jy1);
234 dz21 = _mm256_sub_pd(iz2,jz1);
235 dx22 = _mm256_sub_pd(ix2,jx2);
236 dy22 = _mm256_sub_pd(iy2,jy2);
237 dz22 = _mm256_sub_pd(iz2,jz2);
239 /* Calculate squared distance and things based on it */
240 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
241 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
242 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
243 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
244 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
245 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
246 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
247 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
248 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
250 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
251 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
252 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
253 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
254 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
255 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
256 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
257 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
258 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
260 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
261 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
262 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
263 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
264 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
265 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
266 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
267 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
268 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
/* j-force accumulators are cleared every iteration and handed to the
 * decrement helper once, after all nine pairs have been added. */
270 fjx0 = _mm256_setzero_pd();
271 fjy0 = _mm256_setzero_pd();
272 fjz0 = _mm256_setzero_pd();
273 fjx1 = _mm256_setzero_pd();
274 fjy1 = _mm256_setzero_pd();
275 fjz1 = _mm256_setzero_pd();
276 fjx2 = _mm256_setzero_pd();
277 fjy2 = _mm256_setzero_pd();
278 fjz2 = _mm256_setzero_pd();
280 /**************************
281 * CALCULATE INTERACTIONS *
282 **************************/
284 /* COULOMB ELECTROSTATICS */
285 velec = _mm256_mul_pd(qq00,rinv00);
286 felec = _mm256_mul_pd(velec,rinvsq00);
288 /* LENNARD-JONES DISPERSION/REPULSION */
/* rinvsix = rinvsq^3; potential = vvdw12/12 - vvdw6/6;
 * scalar force = (vvdw12 - vvdw6)*rinvsq */
290 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
291 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
292 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
293 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
294 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
296 /* Update potential sum for this i atom from the interaction with this j atom. */
297 velecsum = _mm256_add_pd(velecsum,velec);
298 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
300 fscal = _mm256_add_pd(felec,fvdw);
302 /* Calculate temporary vectorial force */
303 tx = _mm256_mul_pd(fscal,dx00);
304 ty = _mm256_mul_pd(fscal,dy00);
305 tz = _mm256_mul_pd(fscal,dz00);
307 /* Update vectorial force */
308 fix0 = _mm256_add_pd(fix0,tx);
309 fiy0 = _mm256_add_pd(fiy0,ty);
310 fiz0 = _mm256_add_pd(fiz0,tz);
312 fjx0 = _mm256_add_pd(fjx0,tx);
313 fjy0 = _mm256_add_pd(fjy0,ty);
314 fjz0 = _mm256_add_pd(fjz0,tz);
316 /**************************
317 * CALCULATE INTERACTIONS *
318 **************************/
320 /* COULOMB ELECTROSTATICS */
321 velec = _mm256_mul_pd(qq01,rinv01);
322 felec = _mm256_mul_pd(velec,rinvsq01);
324 /* Update potential sum for this i atom from the interaction with this j atom. */
325 velecsum = _mm256_add_pd(velecsum,velec);
/* NOTE(review): no assignment to fscal is visible for this pair (nor for the
 * remaining Coulomb-only pairs below) in this copy; felec is computed but, as
 * shown, fscal still holds the value from pair 0-0. A line may be missing
 * here - confirm against the generator output. */
329 /* Calculate temporary vectorial force */
330 tx = _mm256_mul_pd(fscal,dx01);
331 ty = _mm256_mul_pd(fscal,dy01);
332 tz = _mm256_mul_pd(fscal,dz01);
334 /* Update vectorial force */
335 fix0 = _mm256_add_pd(fix0,tx);
336 fiy0 = _mm256_add_pd(fiy0,ty);
337 fiz0 = _mm256_add_pd(fiz0,tz);
339 fjx1 = _mm256_add_pd(fjx1,tx);
340 fjy1 = _mm256_add_pd(fjy1,ty);
341 fjz1 = _mm256_add_pd(fjz1,tz);
343 /**************************
344 * CALCULATE INTERACTIONS *
345 **************************/
347 /* COULOMB ELECTROSTATICS */
348 velec = _mm256_mul_pd(qq02,rinv02);
349 felec = _mm256_mul_pd(velec,rinvsq02);
351 /* Update potential sum for this i atom from the interaction with this j atom. */
352 velecsum = _mm256_add_pd(velecsum,velec);
356 /* Calculate temporary vectorial force */
357 tx = _mm256_mul_pd(fscal,dx02);
358 ty = _mm256_mul_pd(fscal,dy02);
359 tz = _mm256_mul_pd(fscal,dz02);
361 /* Update vectorial force */
362 fix0 = _mm256_add_pd(fix0,tx);
363 fiy0 = _mm256_add_pd(fiy0,ty);
364 fiz0 = _mm256_add_pd(fiz0,tz);
366 fjx2 = _mm256_add_pd(fjx2,tx);
367 fjy2 = _mm256_add_pd(fjy2,ty);
368 fjz2 = _mm256_add_pd(fjz2,tz);
370 /**************************
371 * CALCULATE INTERACTIONS *
372 **************************/
374 /* COULOMB ELECTROSTATICS */
375 velec = _mm256_mul_pd(qq10,rinv10);
376 felec = _mm256_mul_pd(velec,rinvsq10);
378 /* Update potential sum for this i atom from the interaction with this j atom. */
379 velecsum = _mm256_add_pd(velecsum,velec);
383 /* Calculate temporary vectorial force */
384 tx = _mm256_mul_pd(fscal,dx10);
385 ty = _mm256_mul_pd(fscal,dy10);
386 tz = _mm256_mul_pd(fscal,dz10);
388 /* Update vectorial force */
389 fix1 = _mm256_add_pd(fix1,tx);
390 fiy1 = _mm256_add_pd(fiy1,ty);
391 fiz1 = _mm256_add_pd(fiz1,tz);
393 fjx0 = _mm256_add_pd(fjx0,tx);
394 fjy0 = _mm256_add_pd(fjy0,ty);
395 fjz0 = _mm256_add_pd(fjz0,tz);
397 /**************************
398 * CALCULATE INTERACTIONS *
399 **************************/
401 /* COULOMB ELECTROSTATICS */
402 velec = _mm256_mul_pd(qq11,rinv11);
403 felec = _mm256_mul_pd(velec,rinvsq11);
405 /* Update potential sum for this i atom from the interaction with this j atom. */
406 velecsum = _mm256_add_pd(velecsum,velec);
410 /* Calculate temporary vectorial force */
411 tx = _mm256_mul_pd(fscal,dx11);
412 ty = _mm256_mul_pd(fscal,dy11);
413 tz = _mm256_mul_pd(fscal,dz11);
415 /* Update vectorial force */
416 fix1 = _mm256_add_pd(fix1,tx);
417 fiy1 = _mm256_add_pd(fiy1,ty);
418 fiz1 = _mm256_add_pd(fiz1,tz);
420 fjx1 = _mm256_add_pd(fjx1,tx);
421 fjy1 = _mm256_add_pd(fjy1,ty);
422 fjz1 = _mm256_add_pd(fjz1,tz);
424 /**************************
425 * CALCULATE INTERACTIONS *
426 **************************/
428 /* COULOMB ELECTROSTATICS */
429 velec = _mm256_mul_pd(qq12,rinv12);
430 felec = _mm256_mul_pd(velec,rinvsq12);
432 /* Update potential sum for this i atom from the interaction with this j atom. */
433 velecsum = _mm256_add_pd(velecsum,velec);
437 /* Calculate temporary vectorial force */
438 tx = _mm256_mul_pd(fscal,dx12);
439 ty = _mm256_mul_pd(fscal,dy12);
440 tz = _mm256_mul_pd(fscal,dz12);
442 /* Update vectorial force */
443 fix1 = _mm256_add_pd(fix1,tx);
444 fiy1 = _mm256_add_pd(fiy1,ty);
445 fiz1 = _mm256_add_pd(fiz1,tz);
447 fjx2 = _mm256_add_pd(fjx2,tx);
448 fjy2 = _mm256_add_pd(fjy2,ty);
449 fjz2 = _mm256_add_pd(fjz2,tz);
451 /**************************
452 * CALCULATE INTERACTIONS *
453 **************************/
455 /* COULOMB ELECTROSTATICS */
456 velec = _mm256_mul_pd(qq20,rinv20);
457 felec = _mm256_mul_pd(velec,rinvsq20);
459 /* Update potential sum for this i atom from the interaction with this j atom. */
460 velecsum = _mm256_add_pd(velecsum,velec);
464 /* Calculate temporary vectorial force */
465 tx = _mm256_mul_pd(fscal,dx20);
466 ty = _mm256_mul_pd(fscal,dy20);
467 tz = _mm256_mul_pd(fscal,dz20);
469 /* Update vectorial force */
470 fix2 = _mm256_add_pd(fix2,tx);
471 fiy2 = _mm256_add_pd(fiy2,ty);
472 fiz2 = _mm256_add_pd(fiz2,tz);
474 fjx0 = _mm256_add_pd(fjx0,tx);
475 fjy0 = _mm256_add_pd(fjy0,ty);
476 fjz0 = _mm256_add_pd(fjz0,tz);
478 /**************************
479 * CALCULATE INTERACTIONS *
480 **************************/
482 /* COULOMB ELECTROSTATICS */
483 velec = _mm256_mul_pd(qq21,rinv21);
484 felec = _mm256_mul_pd(velec,rinvsq21);
486 /* Update potential sum for this i atom from the interaction with this j atom. */
487 velecsum = _mm256_add_pd(velecsum,velec);
491 /* Calculate temporary vectorial force */
492 tx = _mm256_mul_pd(fscal,dx21);
493 ty = _mm256_mul_pd(fscal,dy21);
494 tz = _mm256_mul_pd(fscal,dz21);
496 /* Update vectorial force */
497 fix2 = _mm256_add_pd(fix2,tx);
498 fiy2 = _mm256_add_pd(fiy2,ty);
499 fiz2 = _mm256_add_pd(fiz2,tz);
501 fjx1 = _mm256_add_pd(fjx1,tx);
502 fjy1 = _mm256_add_pd(fjy1,ty);
503 fjz1 = _mm256_add_pd(fjz1,tz);
505 /**************************
506 * CALCULATE INTERACTIONS *
507 **************************/
509 /* COULOMB ELECTROSTATICS */
510 velec = _mm256_mul_pd(qq22,rinv22);
511 felec = _mm256_mul_pd(velec,rinvsq22);
513 /* Update potential sum for this i atom from the interaction with this j atom. */
514 velecsum = _mm256_add_pd(velecsum,velec);
518 /* Calculate temporary vectorial force */
519 tx = _mm256_mul_pd(fscal,dx22);
520 ty = _mm256_mul_pd(fscal,dy22);
521 tz = _mm256_mul_pd(fscal,dz22);
523 /* Update vectorial force */
524 fix2 = _mm256_add_pd(fix2,tx);
525 fiy2 = _mm256_add_pd(fiy2,ty);
526 fiz2 = _mm256_add_pd(fiz2,tz);
528 fjx2 = _mm256_add_pd(fjx2,tx);
529 fjy2 = _mm256_add_pd(fjy2,ty);
530 fjz2 = _mm256_add_pd(fjz2,tz);
/* Hand the accumulated j forces to the decrement helper, one destination
 * pointer into f per SIMD lane. */
532 fjptrA = f+j_coord_offsetA;
533 fjptrB = f+j_coord_offsetB;
534 fjptrC = f+j_coord_offsetC;
535 fjptrD = f+j_coord_offsetD;
537 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
538 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
540 /* Inner loop uses 255 flops */
/* Masked pass over list entries jidx..jidx+3. Entries with a negative index
 * are treated as padding: their coordinate index is clamped to 0 for the
 * loads, their energy and force lanes are cleared with dummy_mask, and their
 * j-force write is redirected to scratch.
 * NOTE(review): the condition guarding this pass is not visible in this
 * copy - confirm against the generator output. */
546 /* Get j neighbor index, and coordinate index */
547 jnrlistA = jjnr[jidx];
548 jnrlistB = jjnr[jidx+1];
549 jnrlistC = jjnr[jidx+2];
550 jnrlistD = jjnr[jidx+3];
551 /* Sign of each element will be negative for non-real atoms.
552 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
553 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
555 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
557 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
558 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
559 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
561 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
562 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
563 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
564 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
565 j_coord_offsetA = DIM*jnrA;
566 j_coord_offsetB = DIM*jnrB;
567 j_coord_offsetC = DIM*jnrC;
568 j_coord_offsetD = DIM*jnrD;
570 /* load j atom coordinates */
571 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
572 x+j_coord_offsetC,x+j_coord_offsetD,
573 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
575 /* Calculate displacement vector */
576 dx00 = _mm256_sub_pd(ix0,jx0);
577 dy00 = _mm256_sub_pd(iy0,jy0);
578 dz00 = _mm256_sub_pd(iz0,jz0);
579 dx01 = _mm256_sub_pd(ix0,jx1);
580 dy01 = _mm256_sub_pd(iy0,jy1);
581 dz01 = _mm256_sub_pd(iz0,jz1);
582 dx02 = _mm256_sub_pd(ix0,jx2);
583 dy02 = _mm256_sub_pd(iy0,jy2);
584 dz02 = _mm256_sub_pd(iz0,jz2);
585 dx10 = _mm256_sub_pd(ix1,jx0);
586 dy10 = _mm256_sub_pd(iy1,jy0);
587 dz10 = _mm256_sub_pd(iz1,jz0);
588 dx11 = _mm256_sub_pd(ix1,jx1);
589 dy11 = _mm256_sub_pd(iy1,jy1);
590 dz11 = _mm256_sub_pd(iz1,jz1);
591 dx12 = _mm256_sub_pd(ix1,jx2);
592 dy12 = _mm256_sub_pd(iy1,jy2);
593 dz12 = _mm256_sub_pd(iz1,jz2);
594 dx20 = _mm256_sub_pd(ix2,jx0);
595 dy20 = _mm256_sub_pd(iy2,jy0);
596 dz20 = _mm256_sub_pd(iz2,jz0);
597 dx21 = _mm256_sub_pd(ix2,jx1);
598 dy21 = _mm256_sub_pd(iy2,jy1);
599 dz21 = _mm256_sub_pd(iz2,jz1);
600 dx22 = _mm256_sub_pd(ix2,jx2);
601 dy22 = _mm256_sub_pd(iy2,jy2);
602 dz22 = _mm256_sub_pd(iz2,jz2);
604 /* Calculate squared distance and things based on it */
605 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
606 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
607 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
608 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
609 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
610 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
611 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
612 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
613 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
615 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
616 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
617 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
618 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
619 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
620 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
621 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
622 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
623 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
625 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
626 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
627 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
628 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
629 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
630 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
631 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
632 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
633 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
635 fjx0 = _mm256_setzero_pd();
636 fjy0 = _mm256_setzero_pd();
637 fjz0 = _mm256_setzero_pd();
638 fjx1 = _mm256_setzero_pd();
639 fjy1 = _mm256_setzero_pd();
640 fjz1 = _mm256_setzero_pd();
641 fjx2 = _mm256_setzero_pd();
642 fjy2 = _mm256_setzero_pd();
643 fjz2 = _mm256_setzero_pd();
645 /**************************
646 * CALCULATE INTERACTIONS *
647 **************************/
649 /* COULOMB ELECTROSTATICS */
650 velec = _mm256_mul_pd(qq00,rinv00);
651 felec = _mm256_mul_pd(velec,rinvsq00);
653 /* LENNARD-JONES DISPERSION/REPULSION */
655 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
656 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
657 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
658 vvdw = _mm256_sub_pd( _mm256_mul_pd(vvdw12,one_twelfth) , _mm256_mul_pd(vvdw6,one_sixth) );
659 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
661 /* Update potential sum for this i atom from the interaction with this j atom. */
/* andnot with dummy_mask zeroes the padding lanes before they are summed */
662 velec = _mm256_andnot_pd(dummy_mask,velec);
663 velecsum = _mm256_add_pd(velecsum,velec);
664 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
665 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
667 fscal = _mm256_add_pd(felec,fvdw);
669 fscal = _mm256_andnot_pd(dummy_mask,fscal);
671 /* Calculate temporary vectorial force */
672 tx = _mm256_mul_pd(fscal,dx00);
673 ty = _mm256_mul_pd(fscal,dy00);
674 tz = _mm256_mul_pd(fscal,dz00);
676 /* Update vectorial force */
677 fix0 = _mm256_add_pd(fix0,tx);
678 fiy0 = _mm256_add_pd(fiy0,ty);
679 fiz0 = _mm256_add_pd(fiz0,tz);
681 fjx0 = _mm256_add_pd(fjx0,tx);
682 fjy0 = _mm256_add_pd(fjy0,ty);
683 fjz0 = _mm256_add_pd(fjz0,tz);
685 /**************************
686 * CALCULATE INTERACTIONS *
687 **************************/
689 /* COULOMB ELECTROSTATICS */
690 velec = _mm256_mul_pd(qq01,rinv01);
691 felec = _mm256_mul_pd(velec,rinvsq01);
693 /* Update potential sum for this i atom from the interaction with this j atom. */
694 velec = _mm256_andnot_pd(dummy_mask,velec);
695 velecsum = _mm256_add_pd(velecsum,velec);
/* NOTE(review): as in the unmasked loop, no assignment of felec to fscal is
 * visible for this or the following Coulomb-only pairs in this copy; only the
 * masking of the existing fscal value is shown - confirm against the
 * generator output. */
699 fscal = _mm256_andnot_pd(dummy_mask,fscal);
701 /* Calculate temporary vectorial force */
702 tx = _mm256_mul_pd(fscal,dx01);
703 ty = _mm256_mul_pd(fscal,dy01);
704 tz = _mm256_mul_pd(fscal,dz01);
706 /* Update vectorial force */
707 fix0 = _mm256_add_pd(fix0,tx);
708 fiy0 = _mm256_add_pd(fiy0,ty);
709 fiz0 = _mm256_add_pd(fiz0,tz);
711 fjx1 = _mm256_add_pd(fjx1,tx);
712 fjy1 = _mm256_add_pd(fjy1,ty);
713 fjz1 = _mm256_add_pd(fjz1,tz);
715 /**************************
716 * CALCULATE INTERACTIONS *
717 **************************/
719 /* COULOMB ELECTROSTATICS */
720 velec = _mm256_mul_pd(qq02,rinv02);
721 felec = _mm256_mul_pd(velec,rinvsq02);
723 /* Update potential sum for this i atom from the interaction with this j atom. */
724 velec = _mm256_andnot_pd(dummy_mask,velec);
725 velecsum = _mm256_add_pd(velecsum,velec);
729 fscal = _mm256_andnot_pd(dummy_mask,fscal);
731 /* Calculate temporary vectorial force */
732 tx = _mm256_mul_pd(fscal,dx02);
733 ty = _mm256_mul_pd(fscal,dy02);
734 tz = _mm256_mul_pd(fscal,dz02);
736 /* Update vectorial force */
737 fix0 = _mm256_add_pd(fix0,tx);
738 fiy0 = _mm256_add_pd(fiy0,ty);
739 fiz0 = _mm256_add_pd(fiz0,tz);
741 fjx2 = _mm256_add_pd(fjx2,tx);
742 fjy2 = _mm256_add_pd(fjy2,ty);
743 fjz2 = _mm256_add_pd(fjz2,tz);
745 /**************************
746 * CALCULATE INTERACTIONS *
747 **************************/
749 /* COULOMB ELECTROSTATICS */
750 velec = _mm256_mul_pd(qq10,rinv10);
751 felec = _mm256_mul_pd(velec,rinvsq10);
753 /* Update potential sum for this i atom from the interaction with this j atom. */
754 velec = _mm256_andnot_pd(dummy_mask,velec);
755 velecsum = _mm256_add_pd(velecsum,velec);
759 fscal = _mm256_andnot_pd(dummy_mask,fscal);
761 /* Calculate temporary vectorial force */
762 tx = _mm256_mul_pd(fscal,dx10);
763 ty = _mm256_mul_pd(fscal,dy10);
764 tz = _mm256_mul_pd(fscal,dz10);
766 /* Update vectorial force */
767 fix1 = _mm256_add_pd(fix1,tx);
768 fiy1 = _mm256_add_pd(fiy1,ty);
769 fiz1 = _mm256_add_pd(fiz1,tz);
771 fjx0 = _mm256_add_pd(fjx0,tx);
772 fjy0 = _mm256_add_pd(fjy0,ty);
773 fjz0 = _mm256_add_pd(fjz0,tz);
775 /**************************
776 * CALCULATE INTERACTIONS *
777 **************************/
779 /* COULOMB ELECTROSTATICS */
780 velec = _mm256_mul_pd(qq11,rinv11);
781 felec = _mm256_mul_pd(velec,rinvsq11);
783 /* Update potential sum for this i atom from the interaction with this j atom. */
784 velec = _mm256_andnot_pd(dummy_mask,velec);
785 velecsum = _mm256_add_pd(velecsum,velec);
789 fscal = _mm256_andnot_pd(dummy_mask,fscal);
791 /* Calculate temporary vectorial force */
792 tx = _mm256_mul_pd(fscal,dx11);
793 ty = _mm256_mul_pd(fscal,dy11);
794 tz = _mm256_mul_pd(fscal,dz11);
796 /* Update vectorial force */
797 fix1 = _mm256_add_pd(fix1,tx);
798 fiy1 = _mm256_add_pd(fiy1,ty);
799 fiz1 = _mm256_add_pd(fiz1,tz);
801 fjx1 = _mm256_add_pd(fjx1,tx);
802 fjy1 = _mm256_add_pd(fjy1,ty);
803 fjz1 = _mm256_add_pd(fjz1,tz);
805 /**************************
806 * CALCULATE INTERACTIONS *
807 **************************/
809 /* COULOMB ELECTROSTATICS */
810 velec = _mm256_mul_pd(qq12,rinv12);
811 felec = _mm256_mul_pd(velec,rinvsq12);
813 /* Update potential sum for this i atom from the interaction with this j atom. */
814 velec = _mm256_andnot_pd(dummy_mask,velec);
815 velecsum = _mm256_add_pd(velecsum,velec);
819 fscal = _mm256_andnot_pd(dummy_mask,fscal);
821 /* Calculate temporary vectorial force */
822 tx = _mm256_mul_pd(fscal,dx12);
823 ty = _mm256_mul_pd(fscal,dy12);
824 tz = _mm256_mul_pd(fscal,dz12);
826 /* Update vectorial force */
827 fix1 = _mm256_add_pd(fix1,tx);
828 fiy1 = _mm256_add_pd(fiy1,ty);
829 fiz1 = _mm256_add_pd(fiz1,tz);
831 fjx2 = _mm256_add_pd(fjx2,tx);
832 fjy2 = _mm256_add_pd(fjy2,ty);
833 fjz2 = _mm256_add_pd(fjz2,tz);
835 /**************************
836 * CALCULATE INTERACTIONS *
837 **************************/
839 /* COULOMB ELECTROSTATICS */
840 velec = _mm256_mul_pd(qq20,rinv20);
841 felec = _mm256_mul_pd(velec,rinvsq20);
843 /* Update potential sum for this i atom from the interaction with this j atom. */
844 velec = _mm256_andnot_pd(dummy_mask,velec);
845 velecsum = _mm256_add_pd(velecsum,velec);
849 fscal = _mm256_andnot_pd(dummy_mask,fscal);
851 /* Calculate temporary vectorial force */
852 tx = _mm256_mul_pd(fscal,dx20);
853 ty = _mm256_mul_pd(fscal,dy20);
854 tz = _mm256_mul_pd(fscal,dz20);
856 /* Update vectorial force */
857 fix2 = _mm256_add_pd(fix2,tx);
858 fiy2 = _mm256_add_pd(fiy2,ty);
859 fiz2 = _mm256_add_pd(fiz2,tz);
861 fjx0 = _mm256_add_pd(fjx0,tx);
862 fjy0 = _mm256_add_pd(fjy0,ty);
863 fjz0 = _mm256_add_pd(fjz0,tz);
865 /**************************
866 * CALCULATE INTERACTIONS *
867 **************************/
869 /* COULOMB ELECTROSTATICS */
870 velec = _mm256_mul_pd(qq21,rinv21);
871 felec = _mm256_mul_pd(velec,rinvsq21);
873 /* Update potential sum for this i atom from the interaction with this j atom. */
874 velec = _mm256_andnot_pd(dummy_mask,velec);
875 velecsum = _mm256_add_pd(velecsum,velec);
879 fscal = _mm256_andnot_pd(dummy_mask,fscal);
881 /* Calculate temporary vectorial force */
882 tx = _mm256_mul_pd(fscal,dx21);
883 ty = _mm256_mul_pd(fscal,dy21);
884 tz = _mm256_mul_pd(fscal,dz21);
886 /* Update vectorial force */
887 fix2 = _mm256_add_pd(fix2,tx);
888 fiy2 = _mm256_add_pd(fiy2,ty);
889 fiz2 = _mm256_add_pd(fiz2,tz);
891 fjx1 = _mm256_add_pd(fjx1,tx);
892 fjy1 = _mm256_add_pd(fjy1,ty);
893 fjz1 = _mm256_add_pd(fjz1,tz);
895 /**************************
896 * CALCULATE INTERACTIONS *
897 **************************/
899 /* COULOMB ELECTROSTATICS */
900 velec = _mm256_mul_pd(qq22,rinv22);
901 felec = _mm256_mul_pd(velec,rinvsq22);
903 /* Update potential sum for this i atom from the interaction with this j atom. */
904 velec = _mm256_andnot_pd(dummy_mask,velec);
905 velecsum = _mm256_add_pd(velecsum,velec);
909 fscal = _mm256_andnot_pd(dummy_mask,fscal);
911 /* Calculate temporary vectorial force */
912 tx = _mm256_mul_pd(fscal,dx22);
913 ty = _mm256_mul_pd(fscal,dy22);
914 tz = _mm256_mul_pd(fscal,dz22);
916 /* Update vectorial force */
917 fix2 = _mm256_add_pd(fix2,tx);
918 fiy2 = _mm256_add_pd(fiy2,ty);
919 fiz2 = _mm256_add_pd(fiz2,tz);
921 fjx2 = _mm256_add_pd(fjx2,tx);
922 fjy2 = _mm256_add_pd(fjy2,ty);
923 fjz2 = _mm256_add_pd(fjz2,tz);
/* Lanes whose list entry was negative write to scratch instead of f, so the
 * clamped index 0 used for the loads is never updated on their behalf. */
925 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
926 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
927 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
928 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
930 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
931 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
933 /* Inner loop uses 255 flops */
936 /* End of innermost loop */
/* Pass the nine i-force accumulators to the update helper along with the
 * i-molecule force destination and the shift-force destination. */
938 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
939 f+i_coord_offset,fshift+i_shift_offset);
942 /* Update potential energies */
943 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
944 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
946 /* Increment number of inner iterations */
/* Counts list entries (j_index_end - j_index_start), not SIMD iterations */
947 inneriter += j_index_end - j_index_start;
949 /* Outer loop uses 20 flops */
952 /* Increment number of outer iterations */
955 /* Update outer/inner flops */
957 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_VF,outeriter*20 + inneriter*255);
960 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_double
961 * Electrostatics interaction: Coulomb
962 * VdW interaction: LennardJones
963 * Geometry: Water3-Water3
964 * Calculate force/pot: Force
967 nb_kernel_ElecCoul_VdwLJ_GeomW3W3_F_avx_256_double
968 (t_nblist * gmx_restrict nlist,
969 rvec * gmx_restrict xx,
970 rvec * gmx_restrict ff,
971 t_forcerec * gmx_restrict fr,
972 t_mdatoms * gmx_restrict mdatoms,
973 nb_kernel_data_t * gmx_restrict kernel_data,
974 t_nrnb * gmx_restrict nrnb)
976 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
977 * just 0 for non-waters.
978 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
979 * jnr indices corresponding to data put in the four positions in the SIMD register.
981 int i_shift_offset,i_coord_offset,outeriter,inneriter;
982 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
983 int jnrA,jnrB,jnrC,jnrD;
984 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
985 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
986 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
987 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
989 real *shiftvec,*fshift,*x,*f;
990 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
992 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
993 real * vdwioffsetptr0;
994 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
995 real * vdwioffsetptr1;
996 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
997 real * vdwioffsetptr2;
998 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
999 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
1000 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
1001 int vdwjidx1A,vdwjidx1B,vdwjidx1C,vdwjidx1D;
1002 __m256d jx1,jy1,jz1,fjx1,fjy1,fjz1,jq1,isaj1;
1003 int vdwjidx2A,vdwjidx2B,vdwjidx2C,vdwjidx2D;
1004 __m256d jx2,jy2,jz2,fjx2,fjy2,fjz2,jq2,isaj2;
1005 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
1006 __m256d dx01,dy01,dz01,rsq01,rinv01,rinvsq01,r01,qq01,c6_01,c12_01;
1007 __m256d dx02,dy02,dz02,rsq02,rinv02,rinvsq02,r02,qq02,c6_02,c12_02;
1008 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
1009 __m256d dx11,dy11,dz11,rsq11,rinv11,rinvsq11,r11,qq11,c6_11,c12_11;
1010 __m256d dx12,dy12,dz12,rsq12,rinv12,rinvsq12,r12,qq12,c6_12,c12_12;
1011 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
1012 __m256d dx21,dy21,dz21,rsq21,rinv21,rinvsq21,r21,qq21,c6_21,c12_21;
1013 __m256d dx22,dy22,dz22,rsq22,rinv22,rinvsq22,r22,qq22,c6_22,c12_22;
1014 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
1017 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
1020 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
1021 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
1022 __m256d dummy_mask,cutoff_mask;
1023 __m128 tmpmask0,tmpmask1;
1024 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
1025 __m256d one = _mm256_set1_pd(1.0);
1026 __m256d two = _mm256_set1_pd(2.0);
1032 jindex = nlist->jindex;
1034 shiftidx = nlist->shift;
1036 shiftvec = fr->shift_vec[0];
1037 fshift = fr->fshift[0];
1038 facel = _mm256_set1_pd(fr->epsfac);
1039 charge = mdatoms->chargeA;
1040 nvdwtype = fr->ntype;
1041 vdwparam = fr->nbfp;
1042 vdwtype = mdatoms->typeA;
1044 /* Setup water-specific parameters */
1045 inr = nlist->iinr[0];
1046 iq0 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+0]));
1047 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
1048 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
1049 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
1051 jq0 = _mm256_set1_pd(charge[inr+0]);
1052 jq1 = _mm256_set1_pd(charge[inr+1]);
1053 jq2 = _mm256_set1_pd(charge[inr+2]);
1054 vdwjidx0A = 2*vdwtype[inr+0];
1055 qq00 = _mm256_mul_pd(iq0,jq0);
1056 c6_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A]);
1057 c12_00 = _mm256_set1_pd(vdwioffsetptr0[vdwjidx0A+1]);
1058 qq01 = _mm256_mul_pd(iq0,jq1);
1059 qq02 = _mm256_mul_pd(iq0,jq2);
1060 qq10 = _mm256_mul_pd(iq1,jq0);
1061 qq11 = _mm256_mul_pd(iq1,jq1);
1062 qq12 = _mm256_mul_pd(iq1,jq2);
1063 qq20 = _mm256_mul_pd(iq2,jq0);
1064 qq21 = _mm256_mul_pd(iq2,jq1);
1065 qq22 = _mm256_mul_pd(iq2,jq2);
1067 /* Avoid stupid compiler warnings */
1068 jnrA = jnrB = jnrC = jnrD = 0;
1069 j_coord_offsetA = 0;
1070 j_coord_offsetB = 0;
1071 j_coord_offsetC = 0;
1072 j_coord_offsetD = 0;
1077 for(iidx=0;iidx<4*DIM;iidx++)
1079 scratch[iidx] = 0.0;
1082 /* Start outer loop over neighborlists */
1083 for(iidx=0; iidx<nri; iidx++)
1085 /* Load shift vector for this list */
1086 i_shift_offset = DIM*shiftidx[iidx];
1088 /* Load limits for loop over neighbors */
1089 j_index_start = jindex[iidx];
1090 j_index_end = jindex[iidx+1];
1092 /* Get outer coordinate index */
1094 i_coord_offset = DIM*inr;
1096 /* Load i particle coords and add shift vector */
1097 gmx_mm256_load_shift_and_3rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
1098 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
1100 fix0 = _mm256_setzero_pd();
1101 fiy0 = _mm256_setzero_pd();
1102 fiz0 = _mm256_setzero_pd();
1103 fix1 = _mm256_setzero_pd();
1104 fiy1 = _mm256_setzero_pd();
1105 fiz1 = _mm256_setzero_pd();
1106 fix2 = _mm256_setzero_pd();
1107 fiy2 = _mm256_setzero_pd();
1108 fiz2 = _mm256_setzero_pd();
1110 /* Start inner kernel loop */
1111 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
1114 /* Get j neighbor index, and coordinate index */
1116 jnrB = jjnr[jidx+1];
1117 jnrC = jjnr[jidx+2];
1118 jnrD = jjnr[jidx+3];
1119 j_coord_offsetA = DIM*jnrA;
1120 j_coord_offsetB = DIM*jnrB;
1121 j_coord_offsetC = DIM*jnrC;
1122 j_coord_offsetD = DIM*jnrD;
1124 /* load j atom coordinates */
1125 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1126 x+j_coord_offsetC,x+j_coord_offsetD,
1127 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1129 /* Calculate displacement vector */
1130 dx00 = _mm256_sub_pd(ix0,jx0);
1131 dy00 = _mm256_sub_pd(iy0,jy0);
1132 dz00 = _mm256_sub_pd(iz0,jz0);
1133 dx01 = _mm256_sub_pd(ix0,jx1);
1134 dy01 = _mm256_sub_pd(iy0,jy1);
1135 dz01 = _mm256_sub_pd(iz0,jz1);
1136 dx02 = _mm256_sub_pd(ix0,jx2);
1137 dy02 = _mm256_sub_pd(iy0,jy2);
1138 dz02 = _mm256_sub_pd(iz0,jz2);
1139 dx10 = _mm256_sub_pd(ix1,jx0);
1140 dy10 = _mm256_sub_pd(iy1,jy0);
1141 dz10 = _mm256_sub_pd(iz1,jz0);
1142 dx11 = _mm256_sub_pd(ix1,jx1);
1143 dy11 = _mm256_sub_pd(iy1,jy1);
1144 dz11 = _mm256_sub_pd(iz1,jz1);
1145 dx12 = _mm256_sub_pd(ix1,jx2);
1146 dy12 = _mm256_sub_pd(iy1,jy2);
1147 dz12 = _mm256_sub_pd(iz1,jz2);
1148 dx20 = _mm256_sub_pd(ix2,jx0);
1149 dy20 = _mm256_sub_pd(iy2,jy0);
1150 dz20 = _mm256_sub_pd(iz2,jz0);
1151 dx21 = _mm256_sub_pd(ix2,jx1);
1152 dy21 = _mm256_sub_pd(iy2,jy1);
1153 dz21 = _mm256_sub_pd(iz2,jz1);
1154 dx22 = _mm256_sub_pd(ix2,jx2);
1155 dy22 = _mm256_sub_pd(iy2,jy2);
1156 dz22 = _mm256_sub_pd(iz2,jz2);
1158 /* Calculate squared distance and things based on it */
1159 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1160 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1161 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1162 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1163 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1164 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1165 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1166 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1167 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1169 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1170 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1171 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1172 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1173 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1174 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1175 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1176 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1177 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1179 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1180 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1181 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1182 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1183 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1184 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1185 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1186 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1187 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1189 fjx0 = _mm256_setzero_pd();
1190 fjy0 = _mm256_setzero_pd();
1191 fjz0 = _mm256_setzero_pd();
1192 fjx1 = _mm256_setzero_pd();
1193 fjy1 = _mm256_setzero_pd();
1194 fjz1 = _mm256_setzero_pd();
1195 fjx2 = _mm256_setzero_pd();
1196 fjy2 = _mm256_setzero_pd();
1197 fjz2 = _mm256_setzero_pd();
1199 /**************************
1200 * CALCULATE INTERACTIONS *
1201 **************************/
1203 /* COULOMB ELECTROSTATICS */
1204 velec = _mm256_mul_pd(qq00,rinv00);
1205 felec = _mm256_mul_pd(velec,rinvsq00);
1207 /* LENNARD-JONES DISPERSION/REPULSION */
1209 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1210 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1212 fscal = _mm256_add_pd(felec,fvdw);
1214 /* Calculate temporary vectorial force */
1215 tx = _mm256_mul_pd(fscal,dx00);
1216 ty = _mm256_mul_pd(fscal,dy00);
1217 tz = _mm256_mul_pd(fscal,dz00);
1219 /* Update vectorial force */
1220 fix0 = _mm256_add_pd(fix0,tx);
1221 fiy0 = _mm256_add_pd(fiy0,ty);
1222 fiz0 = _mm256_add_pd(fiz0,tz);
1224 fjx0 = _mm256_add_pd(fjx0,tx);
1225 fjy0 = _mm256_add_pd(fjy0,ty);
1226 fjz0 = _mm256_add_pd(fjz0,tz);
1228 /**************************
1229 * CALCULATE INTERACTIONS *
1230 **************************/
1232 /* COULOMB ELECTROSTATICS */
1233 velec = _mm256_mul_pd(qq01,rinv01);
1234 felec = _mm256_mul_pd(velec,rinvsq01);
1238 /* Calculate temporary vectorial force */
1239 tx = _mm256_mul_pd(fscal,dx01);
1240 ty = _mm256_mul_pd(fscal,dy01);
1241 tz = _mm256_mul_pd(fscal,dz01);
1243 /* Update vectorial force */
1244 fix0 = _mm256_add_pd(fix0,tx);
1245 fiy0 = _mm256_add_pd(fiy0,ty);
1246 fiz0 = _mm256_add_pd(fiz0,tz);
1248 fjx1 = _mm256_add_pd(fjx1,tx);
1249 fjy1 = _mm256_add_pd(fjy1,ty);
1250 fjz1 = _mm256_add_pd(fjz1,tz);
1252 /**************************
1253 * CALCULATE INTERACTIONS *
1254 **************************/
1256 /* COULOMB ELECTROSTATICS */
1257 velec = _mm256_mul_pd(qq02,rinv02);
1258 felec = _mm256_mul_pd(velec,rinvsq02);
1262 /* Calculate temporary vectorial force */
1263 tx = _mm256_mul_pd(fscal,dx02);
1264 ty = _mm256_mul_pd(fscal,dy02);
1265 tz = _mm256_mul_pd(fscal,dz02);
1267 /* Update vectorial force */
1268 fix0 = _mm256_add_pd(fix0,tx);
1269 fiy0 = _mm256_add_pd(fiy0,ty);
1270 fiz0 = _mm256_add_pd(fiz0,tz);
1272 fjx2 = _mm256_add_pd(fjx2,tx);
1273 fjy2 = _mm256_add_pd(fjy2,ty);
1274 fjz2 = _mm256_add_pd(fjz2,tz);
1276 /**************************
1277 * CALCULATE INTERACTIONS *
1278 **************************/
1280 /* COULOMB ELECTROSTATICS */
1281 velec = _mm256_mul_pd(qq10,rinv10);
1282 felec = _mm256_mul_pd(velec,rinvsq10);
1286 /* Calculate temporary vectorial force */
1287 tx = _mm256_mul_pd(fscal,dx10);
1288 ty = _mm256_mul_pd(fscal,dy10);
1289 tz = _mm256_mul_pd(fscal,dz10);
1291 /* Update vectorial force */
1292 fix1 = _mm256_add_pd(fix1,tx);
1293 fiy1 = _mm256_add_pd(fiy1,ty);
1294 fiz1 = _mm256_add_pd(fiz1,tz);
1296 fjx0 = _mm256_add_pd(fjx0,tx);
1297 fjy0 = _mm256_add_pd(fjy0,ty);
1298 fjz0 = _mm256_add_pd(fjz0,tz);
1300 /**************************
1301 * CALCULATE INTERACTIONS *
1302 **************************/
1304 /* COULOMB ELECTROSTATICS */
1305 velec = _mm256_mul_pd(qq11,rinv11);
1306 felec = _mm256_mul_pd(velec,rinvsq11);
1310 /* Calculate temporary vectorial force */
1311 tx = _mm256_mul_pd(fscal,dx11);
1312 ty = _mm256_mul_pd(fscal,dy11);
1313 tz = _mm256_mul_pd(fscal,dz11);
1315 /* Update vectorial force */
1316 fix1 = _mm256_add_pd(fix1,tx);
1317 fiy1 = _mm256_add_pd(fiy1,ty);
1318 fiz1 = _mm256_add_pd(fiz1,tz);
1320 fjx1 = _mm256_add_pd(fjx1,tx);
1321 fjy1 = _mm256_add_pd(fjy1,ty);
1322 fjz1 = _mm256_add_pd(fjz1,tz);
1324 /**************************
1325 * CALCULATE INTERACTIONS *
1326 **************************/
1328 /* COULOMB ELECTROSTATICS */
1329 velec = _mm256_mul_pd(qq12,rinv12);
1330 felec = _mm256_mul_pd(velec,rinvsq12);
1334 /* Calculate temporary vectorial force */
1335 tx = _mm256_mul_pd(fscal,dx12);
1336 ty = _mm256_mul_pd(fscal,dy12);
1337 tz = _mm256_mul_pd(fscal,dz12);
1339 /* Update vectorial force */
1340 fix1 = _mm256_add_pd(fix1,tx);
1341 fiy1 = _mm256_add_pd(fiy1,ty);
1342 fiz1 = _mm256_add_pd(fiz1,tz);
1344 fjx2 = _mm256_add_pd(fjx2,tx);
1345 fjy2 = _mm256_add_pd(fjy2,ty);
1346 fjz2 = _mm256_add_pd(fjz2,tz);
1348 /**************************
1349 * CALCULATE INTERACTIONS *
1350 **************************/
1352 /* COULOMB ELECTROSTATICS */
1353 velec = _mm256_mul_pd(qq20,rinv20);
1354 felec = _mm256_mul_pd(velec,rinvsq20);
1358 /* Calculate temporary vectorial force */
1359 tx = _mm256_mul_pd(fscal,dx20);
1360 ty = _mm256_mul_pd(fscal,dy20);
1361 tz = _mm256_mul_pd(fscal,dz20);
1363 /* Update vectorial force */
1364 fix2 = _mm256_add_pd(fix2,tx);
1365 fiy2 = _mm256_add_pd(fiy2,ty);
1366 fiz2 = _mm256_add_pd(fiz2,tz);
1368 fjx0 = _mm256_add_pd(fjx0,tx);
1369 fjy0 = _mm256_add_pd(fjy0,ty);
1370 fjz0 = _mm256_add_pd(fjz0,tz);
1372 /**************************
1373 * CALCULATE INTERACTIONS *
1374 **************************/
1376 /* COULOMB ELECTROSTATICS */
1377 velec = _mm256_mul_pd(qq21,rinv21);
1378 felec = _mm256_mul_pd(velec,rinvsq21);
1382 /* Calculate temporary vectorial force */
1383 tx = _mm256_mul_pd(fscal,dx21);
1384 ty = _mm256_mul_pd(fscal,dy21);
1385 tz = _mm256_mul_pd(fscal,dz21);
1387 /* Update vectorial force */
1388 fix2 = _mm256_add_pd(fix2,tx);
1389 fiy2 = _mm256_add_pd(fiy2,ty);
1390 fiz2 = _mm256_add_pd(fiz2,tz);
1392 fjx1 = _mm256_add_pd(fjx1,tx);
1393 fjy1 = _mm256_add_pd(fjy1,ty);
1394 fjz1 = _mm256_add_pd(fjz1,tz);
1396 /**************************
1397 * CALCULATE INTERACTIONS *
1398 **************************/
1400 /* COULOMB ELECTROSTATICS */
1401 velec = _mm256_mul_pd(qq22,rinv22);
1402 felec = _mm256_mul_pd(velec,rinvsq22);
1406 /* Calculate temporary vectorial force */
1407 tx = _mm256_mul_pd(fscal,dx22);
1408 ty = _mm256_mul_pd(fscal,dy22);
1409 tz = _mm256_mul_pd(fscal,dz22);
1411 /* Update vectorial force */
1412 fix2 = _mm256_add_pd(fix2,tx);
1413 fiy2 = _mm256_add_pd(fiy2,ty);
1414 fiz2 = _mm256_add_pd(fiz2,tz);
1416 fjx2 = _mm256_add_pd(fjx2,tx);
1417 fjy2 = _mm256_add_pd(fjy2,ty);
1418 fjz2 = _mm256_add_pd(fjz2,tz);
1420 fjptrA = f+j_coord_offsetA;
1421 fjptrB = f+j_coord_offsetB;
1422 fjptrC = f+j_coord_offsetC;
1423 fjptrD = f+j_coord_offsetD;
1425 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1426 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1428 /* Inner loop uses 241 flops */
1431 if(jidx<j_index_end)
1434 /* Get j neighbor index, and coordinate index */
1435 jnrlistA = jjnr[jidx];
1436 jnrlistB = jjnr[jidx+1];
1437 jnrlistC = jjnr[jidx+2];
1438 jnrlistD = jjnr[jidx+3];
1439 /* Sign of each element will be negative for non-real atoms.
1440 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1441 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1443 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1445 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1446 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1447 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1449 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1450 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1451 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1452 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1453 j_coord_offsetA = DIM*jnrA;
1454 j_coord_offsetB = DIM*jnrB;
1455 j_coord_offsetC = DIM*jnrC;
1456 j_coord_offsetD = DIM*jnrD;
1458 /* load j atom coordinates */
1459 gmx_mm256_load_3rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1460 x+j_coord_offsetC,x+j_coord_offsetD,
1461 &jx0,&jy0,&jz0,&jx1,&jy1,&jz1,&jx2,&jy2,&jz2);
1463 /* Calculate displacement vector */
1464 dx00 = _mm256_sub_pd(ix0,jx0);
1465 dy00 = _mm256_sub_pd(iy0,jy0);
1466 dz00 = _mm256_sub_pd(iz0,jz0);
1467 dx01 = _mm256_sub_pd(ix0,jx1);
1468 dy01 = _mm256_sub_pd(iy0,jy1);
1469 dz01 = _mm256_sub_pd(iz0,jz1);
1470 dx02 = _mm256_sub_pd(ix0,jx2);
1471 dy02 = _mm256_sub_pd(iy0,jy2);
1472 dz02 = _mm256_sub_pd(iz0,jz2);
1473 dx10 = _mm256_sub_pd(ix1,jx0);
1474 dy10 = _mm256_sub_pd(iy1,jy0);
1475 dz10 = _mm256_sub_pd(iz1,jz0);
1476 dx11 = _mm256_sub_pd(ix1,jx1);
1477 dy11 = _mm256_sub_pd(iy1,jy1);
1478 dz11 = _mm256_sub_pd(iz1,jz1);
1479 dx12 = _mm256_sub_pd(ix1,jx2);
1480 dy12 = _mm256_sub_pd(iy1,jy2);
1481 dz12 = _mm256_sub_pd(iz1,jz2);
1482 dx20 = _mm256_sub_pd(ix2,jx0);
1483 dy20 = _mm256_sub_pd(iy2,jy0);
1484 dz20 = _mm256_sub_pd(iz2,jz0);
1485 dx21 = _mm256_sub_pd(ix2,jx1);
1486 dy21 = _mm256_sub_pd(iy2,jy1);
1487 dz21 = _mm256_sub_pd(iz2,jz1);
1488 dx22 = _mm256_sub_pd(ix2,jx2);
1489 dy22 = _mm256_sub_pd(iy2,jy2);
1490 dz22 = _mm256_sub_pd(iz2,jz2);
1492 /* Calculate squared distance and things based on it */
1493 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1494 rsq01 = gmx_mm256_calc_rsq_pd(dx01,dy01,dz01);
1495 rsq02 = gmx_mm256_calc_rsq_pd(dx02,dy02,dz02);
1496 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1497 rsq11 = gmx_mm256_calc_rsq_pd(dx11,dy11,dz11);
1498 rsq12 = gmx_mm256_calc_rsq_pd(dx12,dy12,dz12);
1499 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1500 rsq21 = gmx_mm256_calc_rsq_pd(dx21,dy21,dz21);
1501 rsq22 = gmx_mm256_calc_rsq_pd(dx22,dy22,dz22);
1503 rinv00 = gmx_mm256_invsqrt_pd(rsq00);
1504 rinv01 = gmx_mm256_invsqrt_pd(rsq01);
1505 rinv02 = gmx_mm256_invsqrt_pd(rsq02);
1506 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1507 rinv11 = gmx_mm256_invsqrt_pd(rsq11);
1508 rinv12 = gmx_mm256_invsqrt_pd(rsq12);
1509 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1510 rinv21 = gmx_mm256_invsqrt_pd(rsq21);
1511 rinv22 = gmx_mm256_invsqrt_pd(rsq22);
1513 rinvsq00 = _mm256_mul_pd(rinv00,rinv00);
1514 rinvsq01 = _mm256_mul_pd(rinv01,rinv01);
1515 rinvsq02 = _mm256_mul_pd(rinv02,rinv02);
1516 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1517 rinvsq11 = _mm256_mul_pd(rinv11,rinv11);
1518 rinvsq12 = _mm256_mul_pd(rinv12,rinv12);
1519 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1520 rinvsq21 = _mm256_mul_pd(rinv21,rinv21);
1521 rinvsq22 = _mm256_mul_pd(rinv22,rinv22);
1523 fjx0 = _mm256_setzero_pd();
1524 fjy0 = _mm256_setzero_pd();
1525 fjz0 = _mm256_setzero_pd();
1526 fjx1 = _mm256_setzero_pd();
1527 fjy1 = _mm256_setzero_pd();
1528 fjz1 = _mm256_setzero_pd();
1529 fjx2 = _mm256_setzero_pd();
1530 fjy2 = _mm256_setzero_pd();
1531 fjz2 = _mm256_setzero_pd();
1533 /**************************
1534 * CALCULATE INTERACTIONS *
1535 **************************/
1537 /* COULOMB ELECTROSTATICS */
1538 velec = _mm256_mul_pd(qq00,rinv00);
1539 felec = _mm256_mul_pd(velec,rinvsq00);
1541 /* LENNARD-JONES DISPERSION/REPULSION */
1543 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1544 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1546 fscal = _mm256_add_pd(felec,fvdw);
1548 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1550 /* Calculate temporary vectorial force */
1551 tx = _mm256_mul_pd(fscal,dx00);
1552 ty = _mm256_mul_pd(fscal,dy00);
1553 tz = _mm256_mul_pd(fscal,dz00);
1555 /* Update vectorial force */
1556 fix0 = _mm256_add_pd(fix0,tx);
1557 fiy0 = _mm256_add_pd(fiy0,ty);
1558 fiz0 = _mm256_add_pd(fiz0,tz);
1560 fjx0 = _mm256_add_pd(fjx0,tx);
1561 fjy0 = _mm256_add_pd(fjy0,ty);
1562 fjz0 = _mm256_add_pd(fjz0,tz);
1564 /**************************
1565 * CALCULATE INTERACTIONS *
1566 **************************/
1568 /* COULOMB ELECTROSTATICS */
1569 velec = _mm256_mul_pd(qq01,rinv01);
1570 felec = _mm256_mul_pd(velec,rinvsq01);
1574 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1576 /* Calculate temporary vectorial force */
1577 tx = _mm256_mul_pd(fscal,dx01);
1578 ty = _mm256_mul_pd(fscal,dy01);
1579 tz = _mm256_mul_pd(fscal,dz01);
1581 /* Update vectorial force */
1582 fix0 = _mm256_add_pd(fix0,tx);
1583 fiy0 = _mm256_add_pd(fiy0,ty);
1584 fiz0 = _mm256_add_pd(fiz0,tz);
1586 fjx1 = _mm256_add_pd(fjx1,tx);
1587 fjy1 = _mm256_add_pd(fjy1,ty);
1588 fjz1 = _mm256_add_pd(fjz1,tz);
1590 /**************************
1591 * CALCULATE INTERACTIONS *
1592 **************************/
1594 /* COULOMB ELECTROSTATICS */
1595 velec = _mm256_mul_pd(qq02,rinv02);
1596 felec = _mm256_mul_pd(velec,rinvsq02);
1600 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1602 /* Calculate temporary vectorial force */
1603 tx = _mm256_mul_pd(fscal,dx02);
1604 ty = _mm256_mul_pd(fscal,dy02);
1605 tz = _mm256_mul_pd(fscal,dz02);
1607 /* Update vectorial force */
1608 fix0 = _mm256_add_pd(fix0,tx);
1609 fiy0 = _mm256_add_pd(fiy0,ty);
1610 fiz0 = _mm256_add_pd(fiz0,tz);
1612 fjx2 = _mm256_add_pd(fjx2,tx);
1613 fjy2 = _mm256_add_pd(fjy2,ty);
1614 fjz2 = _mm256_add_pd(fjz2,tz);
1616 /**************************
1617 * CALCULATE INTERACTIONS *
1618 **************************/
1620 /* COULOMB ELECTROSTATICS */
1621 velec = _mm256_mul_pd(qq10,rinv10);
1622 felec = _mm256_mul_pd(velec,rinvsq10);
1626 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1628 /* Calculate temporary vectorial force */
1629 tx = _mm256_mul_pd(fscal,dx10);
1630 ty = _mm256_mul_pd(fscal,dy10);
1631 tz = _mm256_mul_pd(fscal,dz10);
1633 /* Update vectorial force */
1634 fix1 = _mm256_add_pd(fix1,tx);
1635 fiy1 = _mm256_add_pd(fiy1,ty);
1636 fiz1 = _mm256_add_pd(fiz1,tz);
1638 fjx0 = _mm256_add_pd(fjx0,tx);
1639 fjy0 = _mm256_add_pd(fjy0,ty);
1640 fjz0 = _mm256_add_pd(fjz0,tz);
1642 /**************************
1643 * CALCULATE INTERACTIONS *
1644 **************************/
1646 /* COULOMB ELECTROSTATICS */
1647 velec = _mm256_mul_pd(qq11,rinv11);
1648 felec = _mm256_mul_pd(velec,rinvsq11);
1652 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1654 /* Calculate temporary vectorial force */
1655 tx = _mm256_mul_pd(fscal,dx11);
1656 ty = _mm256_mul_pd(fscal,dy11);
1657 tz = _mm256_mul_pd(fscal,dz11);
1659 /* Update vectorial force */
1660 fix1 = _mm256_add_pd(fix1,tx);
1661 fiy1 = _mm256_add_pd(fiy1,ty);
1662 fiz1 = _mm256_add_pd(fiz1,tz);
1664 fjx1 = _mm256_add_pd(fjx1,tx);
1665 fjy1 = _mm256_add_pd(fjy1,ty);
1666 fjz1 = _mm256_add_pd(fjz1,tz);
1668 /**************************
1669 * CALCULATE INTERACTIONS *
1670 **************************/
1672 /* COULOMB ELECTROSTATICS */
1673 velec = _mm256_mul_pd(qq12,rinv12);
1674 felec = _mm256_mul_pd(velec,rinvsq12);
1678 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1680 /* Calculate temporary vectorial force */
1681 tx = _mm256_mul_pd(fscal,dx12);
1682 ty = _mm256_mul_pd(fscal,dy12);
1683 tz = _mm256_mul_pd(fscal,dz12);
1685 /* Update vectorial force */
1686 fix1 = _mm256_add_pd(fix1,tx);
1687 fiy1 = _mm256_add_pd(fiy1,ty);
1688 fiz1 = _mm256_add_pd(fiz1,tz);
1690 fjx2 = _mm256_add_pd(fjx2,tx);
1691 fjy2 = _mm256_add_pd(fjy2,ty);
1692 fjz2 = _mm256_add_pd(fjz2,tz);
1694 /**************************
1695 * CALCULATE INTERACTIONS *
1696 **************************/
1698 /* COULOMB ELECTROSTATICS */
1699 velec = _mm256_mul_pd(qq20,rinv20);
1700 felec = _mm256_mul_pd(velec,rinvsq20);
1704 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1706 /* Calculate temporary vectorial force */
1707 tx = _mm256_mul_pd(fscal,dx20);
1708 ty = _mm256_mul_pd(fscal,dy20);
1709 tz = _mm256_mul_pd(fscal,dz20);
1711 /* Update vectorial force */
1712 fix2 = _mm256_add_pd(fix2,tx);
1713 fiy2 = _mm256_add_pd(fiy2,ty);
1714 fiz2 = _mm256_add_pd(fiz2,tz);
1716 fjx0 = _mm256_add_pd(fjx0,tx);
1717 fjy0 = _mm256_add_pd(fjy0,ty);
1718 fjz0 = _mm256_add_pd(fjz0,tz);
1720 /**************************
1721 * CALCULATE INTERACTIONS *
1722 **************************/
1724 /* COULOMB ELECTROSTATICS */
1725 velec = _mm256_mul_pd(qq21,rinv21);
1726 felec = _mm256_mul_pd(velec,rinvsq21);
1730 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1732 /* Calculate temporary vectorial force */
1733 tx = _mm256_mul_pd(fscal,dx21);
1734 ty = _mm256_mul_pd(fscal,dy21);
1735 tz = _mm256_mul_pd(fscal,dz21);
1737 /* Update vectorial force */
1738 fix2 = _mm256_add_pd(fix2,tx);
1739 fiy2 = _mm256_add_pd(fiy2,ty);
1740 fiz2 = _mm256_add_pd(fiz2,tz);
1742 fjx1 = _mm256_add_pd(fjx1,tx);
1743 fjy1 = _mm256_add_pd(fjy1,ty);
1744 fjz1 = _mm256_add_pd(fjz1,tz);
1746 /**************************
1747 * CALCULATE INTERACTIONS *
1748 **************************/
1750 /* COULOMB ELECTROSTATICS */
1751 velec = _mm256_mul_pd(qq22,rinv22);
1752 felec = _mm256_mul_pd(velec,rinvsq22);
1756 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1758 /* Calculate temporary vectorial force */
1759 tx = _mm256_mul_pd(fscal,dx22);
1760 ty = _mm256_mul_pd(fscal,dy22);
1761 tz = _mm256_mul_pd(fscal,dz22);
1763 /* Update vectorial force */
1764 fix2 = _mm256_add_pd(fix2,tx);
1765 fiy2 = _mm256_add_pd(fiy2,ty);
1766 fiz2 = _mm256_add_pd(fiz2,tz);
1768 fjx2 = _mm256_add_pd(fjx2,tx);
1769 fjy2 = _mm256_add_pd(fjy2,ty);
1770 fjz2 = _mm256_add_pd(fjz2,tz);
1772 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1773 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1774 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1775 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1777 gmx_mm256_decrement_3rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,
1778 fjx0,fjy0,fjz0,fjx1,fjy1,fjz1,fjx2,fjy2,fjz2);
1780 /* Inner loop uses 241 flops */
1783 /* End of innermost loop */
1785 gmx_mm256_update_iforce_3atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1786 f+i_coord_offset,fshift+i_shift_offset);
1788 /* Increment number of inner iterations */
1789 inneriter += j_index_end - j_index_start;
1791 /* Outer loop uses 18 flops */
1794 /* Increment number of outer iterations */
1797 /* Update outer/inner flops */
1799 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3W3_F,outeriter*18 + inneriter*241);