2 * Note: this file was generated by the Gromacs avx_256_single kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_single.h"
34 #include "kernelutil_x86_avx_256_single.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_256_single
38 * Electrostatics interaction: ReactionField
39 * VdW interaction: LennardJones
40 * Geometry: Water3-Particle
41 * Calculate force/pot: PotentialAndForce
/*
 * Potential-and-force nonbonded kernel for a three-site i molecule (sites 0,1,2)
 * interacting with single j particles, processed eight j entries per pass in
 * __m256 registers.
 *
 * Per i/j pair the visible code evaluates:
 *   - reaction-field electrostatics for all three i sites (pairs 0-0, 1-0, 2-0),
 *   - Lennard-Jones with a shifted potential for the 0-0 pair only,
 * and zeroes every lane whose squared distance is not below rcutoff2.
 *
 * Parameters, as used in the visible code:
 *   nlist       - neighbor list; jindex, shift and iinr[0] are read here.
 *   xx, ff      - coordinate and force arrays (the local x/f pointers used below
 *                 are assigned on lines that are not visible in this excerpt).
 *   fr          - supplies shift_vec, fshift, epsfac, ntype, rcoulomb, rvdw and
 *                 ic->k_rf, ic->c_rf, ic->sh_invrc6.
 *   mdatoms     - supplies chargeA and typeA.
 *   kernel_data - energygrp_elec and energygrp_vdw receive the summed potentials.
 *   nrnb        - flop counter, updated once via inc_nrnb at the end.
 *
 * NOTE(review): this listing is missing a number of short source lines (braces,
 * assignments to nri/jjnr/x/f/ggid/vdwparam, the output arguments of the load
 * helpers, ...). The comments below describe only what is visible.
 */
44 nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_VF_avx_256_single
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
56 * jnr indices corresponding to data put in the eight positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrE,jnrF,jnrG,jnrH;
62 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
63 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
64 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
65 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
66 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
68 real *shiftvec,*fshift,*x,*f;
69 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
71 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
72 real * vdwioffsetptr0;
73 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
74 real * vdwioffsetptr1;
75 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
76 real * vdwioffsetptr2;
77 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
79 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
81 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
83 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
86 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
89 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
90 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
91 __m256 dummy_mask,cutoff_mask;
92 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
93 __m256 one = _mm256_set1_ps(1.0);
94 __m256 two = _mm256_set1_ps(2.0);
100 jindex = nlist->jindex;
102 shiftidx = nlist->shift;
104 shiftvec = fr->shift_vec[0];
105 fshift = fr->fshift[0];
106 facel = _mm256_set1_ps(fr->epsfac);
107 charge = mdatoms->chargeA;
108 krf = _mm256_set1_ps(fr->ic->k_rf);
109 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
110 crf = _mm256_set1_ps(fr->ic->c_rf);
111 nvdwtype = fr->ntype;
113 vdwtype = mdatoms->typeA;
115 /* Setup water-specific parameters */
/* The i-site charges and vdW row are taken from the first list entry, before the
 * outer loop, and are not reassigned in the visible code.
 * NOTE(review): presumably every i entry in this list is the same water model - confirm. */
116 inr = nlist->iinr[0];
/* Fold facel into the i charges once, so each pair needs only qqX0 = iqX*jq0. */
117 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
118 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
119 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
/* Only site 0 gets a vdW parameter row; the 1-0 and 2-0 blocks below contain no LJ terms. */
120 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
122 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
123 rcutoff_scalar = fr->rcoulomb;
124 rcutoff = _mm256_set1_ps(rcutoff_scalar);
/* All cutoff tests below compare rsq against this squared value. */
125 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
/* sh_vdw_invrcut6 enters the shifted LJ potential below; rvdw is not referenced
 * again in the visible part of this function. */
127 sh_vdw_invrcut6 = _mm256_set1_ps(fr->ic->sh_invrc6);
128 rvdw = _mm256_set1_ps(fr->rvdw);
130 /* Avoid stupid compiler warnings */
131 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
/* NOTE(review): the body of the following loop is not visible in this excerpt. */
144 for(iidx=0;iidx<4*DIM;iidx++)
149 /* Start outer loop over neighborlists */
150 for(iidx=0; iidx<nri; iidx++)
152 /* Load shift vector for this list */
153 i_shift_offset = DIM*shiftidx[iidx];
155 /* Load limits for loop over neighbors */
156 j_index_start = jindex[iidx];
157 j_index_end = jindex[iidx+1];
159 /* Get outer coordinate index */
161 i_coord_offset = DIM*inr;
163 /* Load i particle coords and add shift vector */
164 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
165 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
/* Per-i-site force accumulators, cleared for every outer iteration. */
167 fix0 = _mm256_setzero_ps();
168 fiy0 = _mm256_setzero_ps();
169 fiz0 = _mm256_setzero_ps();
170 fix1 = _mm256_setzero_ps();
171 fiy1 = _mm256_setzero_ps();
172 fiz1 = _mm256_setzero_ps();
173 fix2 = _mm256_setzero_ps();
174 fiy2 = _mm256_setzero_ps();
175 fiz2 = _mm256_setzero_ps();
177 /* Reset potential sums */
178 velecsum = _mm256_setzero_ps();
179 vvdwsum = _mm256_setzero_ps();
181 /* Start inner kernel loop */
/* Runs only while the eighth entry of the current group is non-negative, so no
 * dummy masking is applied in this loop.
 * NOTE(review): presumably negative (padding) entries only occur at the end of a
 * list, so testing the last lane is sufficient - confirm against the list builder. */
182 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
185 /* Get j neighbor index, and coordinate index */
194 j_coord_offsetA = DIM*jnrA;
195 j_coord_offsetB = DIM*jnrB;
196 j_coord_offsetC = DIM*jnrC;
197 j_coord_offsetD = DIM*jnrD;
198 j_coord_offsetE = DIM*jnrE;
199 j_coord_offsetF = DIM*jnrF;
200 j_coord_offsetG = DIM*jnrG;
201 j_coord_offsetH = DIM*jnrH;
203 /* load j atom coordinates */
204 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
205 x+j_coord_offsetC,x+j_coord_offsetD,
206 x+j_coord_offsetE,x+j_coord_offsetF,
207 x+j_coord_offsetG,x+j_coord_offsetH,
210 /* Calculate displacement vector */
211 dx00 = _mm256_sub_ps(ix0,jx0);
212 dy00 = _mm256_sub_ps(iy0,jy0);
213 dz00 = _mm256_sub_ps(iz0,jz0);
214 dx10 = _mm256_sub_ps(ix1,jx0);
215 dy10 = _mm256_sub_ps(iy1,jy0);
216 dz10 = _mm256_sub_ps(iz1,jz0);
217 dx20 = _mm256_sub_ps(ix2,jx0);
218 dy20 = _mm256_sub_ps(iy2,jy0);
219 dz20 = _mm256_sub_ps(iz2,jz0);
221 /* Calculate squared distance and things based on it */
222 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
223 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
224 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
226 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
227 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
228 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
230 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
231 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
232 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
234 /* Load parameters for j particles */
235 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
236 charge+jnrC+0,charge+jnrD+0,
237 charge+jnrE+0,charge+jnrF+0,
238 charge+jnrG+0,charge+jnrH+0);
/* Offsets into the i-site vdW row; the factor 2 matches the pair load below. */
239 vdwjidx0A = 2*vdwtype[jnrA+0];
240 vdwjidx0B = 2*vdwtype[jnrB+0];
241 vdwjidx0C = 2*vdwtype[jnrC+0];
242 vdwjidx0D = 2*vdwtype[jnrD+0];
243 vdwjidx0E = 2*vdwtype[jnrE+0];
244 vdwjidx0F = 2*vdwtype[jnrF+0];
245 vdwjidx0G = 2*vdwtype[jnrG+0];
246 vdwjidx0H = 2*vdwtype[jnrH+0];
/* The j force is summed over all three i sites and written back once per pass. */
248 fjx0 = _mm256_setzero_ps();
249 fjy0 = _mm256_setzero_ps();
250 fjz0 = _mm256_setzero_ps();
252 /**************************
253 * CALCULATE INTERACTIONS *
254 **************************/
/* Skip the whole 0-0 block when no lane is inside the cutoff. */
256 if (gmx_mm256_any_lt(rsq00,rcutoff2))
259 /* Compute parameters for interactions between i and j atoms */
260 qq00 = _mm256_mul_ps(iq0,jq0);
/* NOTE(review): the output arguments of this load are not visible here;
 * presumably it fills c6_00 and c12_00, which are used below - confirm. */
261 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
262 vdwioffsetptr0+vdwjidx0B,
263 vdwioffsetptr0+vdwjidx0C,
264 vdwioffsetptr0+vdwjidx0D,
265 vdwioffsetptr0+vdwjidx0E,
266 vdwioffsetptr0+vdwjidx0F,
267 vdwioffsetptr0+vdwjidx0G,
268 vdwioffsetptr0+vdwjidx0H,
271 /* REACTION-FIELD ELECTROSTATICS */
/* velec = qq*(rinv + krf*rsq - crf);  felec = qq*(rinv*rinvsq - krf2) */
272 velec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
273 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
275 /* LENNARD-JONES DISPERSION/REPULSION */
277 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
278 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
279 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
/* Shifted potential: (vvdw12 - c12*sh^2)/12 - (vvdw6 - c6*sh)/6, with sh = sh_vdw_invrcut6.
 * The force below uses the unshifted terms: fvdw = (vvdw12 - vvdw6)*rinvsq. */
280 vvdw = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
281 _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
282 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
/* Per-lane mask for rsq < rcutoff2; AND-ing with it zeroes lanes outside the cutoff. */
284 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
286 /* Update potential sum for this i atom from the interaction with this j atom. */
287 velec = _mm256_and_ps(velec,cutoff_mask);
288 velecsum = _mm256_add_ps(velecsum,velec);
289 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
290 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
292 fscal = _mm256_add_ps(felec,fvdw);
294 fscal = _mm256_and_ps(fscal,cutoff_mask);
296 /* Calculate temporary vectorial force */
297 tx = _mm256_mul_ps(fscal,dx00);
298 ty = _mm256_mul_ps(fscal,dy00);
299 tz = _mm256_mul_ps(fscal,dz00);
301 /* Update vectorial force */
302 fix0 = _mm256_add_ps(fix0,tx);
303 fiy0 = _mm256_add_ps(fiy0,ty);
304 fiz0 = _mm256_add_ps(fiz0,tz);
/* The same vector is accumulated for j; it is handed to the "decrement" helper
 * after all three pairs have been processed. */
306 fjx0 = _mm256_add_ps(fjx0,tx);
307 fjy0 = _mm256_add_ps(fjy0,ty);
308 fjz0 = _mm256_add_ps(fjz0,tz);
312 /**************************
313 * CALCULATE INTERACTIONS *
314 **************************/
316 if (gmx_mm256_any_lt(rsq10,rcutoff2))
319 /* Compute parameters for interactions between i and j atoms */
320 qq10 = _mm256_mul_ps(iq1,jq0);
322 /* REACTION-FIELD ELECTROSTATICS */
323 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
324 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
326 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
328 /* Update potential sum for this i atom from the interaction with this j atom. */
329 velec = _mm256_and_ps(velec,cutoff_mask);
330 velecsum = _mm256_add_ps(velecsum,velec);
/* NOTE(review): no assignment to fscal for this pair is visible in this excerpt
 * (the embedded line numbers jump from 330 to 334). Confirm against the full
 * source that fscal is set from felec before it is masked. The same applies to
 * the 2-0 block and to both electrostatics-only blocks of the remainder pass. */
334 fscal = _mm256_and_ps(fscal,cutoff_mask);
336 /* Calculate temporary vectorial force */
337 tx = _mm256_mul_ps(fscal,dx10);
338 ty = _mm256_mul_ps(fscal,dy10);
339 tz = _mm256_mul_ps(fscal,dz10);
341 /* Update vectorial force */
342 fix1 = _mm256_add_ps(fix1,tx);
343 fiy1 = _mm256_add_ps(fiy1,ty);
344 fiz1 = _mm256_add_ps(fiz1,tz);
346 fjx0 = _mm256_add_ps(fjx0,tx);
347 fjy0 = _mm256_add_ps(fjy0,ty);
348 fjz0 = _mm256_add_ps(fjz0,tz);
352 /**************************
353 * CALCULATE INTERACTIONS *
354 **************************/
356 if (gmx_mm256_any_lt(rsq20,rcutoff2))
359 /* Compute parameters for interactions between i and j atoms */
360 qq20 = _mm256_mul_ps(iq2,jq0);
362 /* REACTION-FIELD ELECTROSTATICS */
363 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
364 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
366 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
368 /* Update potential sum for this i atom from the interaction with this j atom. */
369 velec = _mm256_and_ps(velec,cutoff_mask);
370 velecsum = _mm256_add_ps(velecsum,velec);
374 fscal = _mm256_and_ps(fscal,cutoff_mask);
376 /* Calculate temporary vectorial force */
377 tx = _mm256_mul_ps(fscal,dx20);
378 ty = _mm256_mul_ps(fscal,dy20);
379 tz = _mm256_mul_ps(fscal,dz20);
381 /* Update vectorial force */
382 fix2 = _mm256_add_ps(fix2,tx);
383 fiy2 = _mm256_add_ps(fiy2,ty);
384 fiz2 = _mm256_add_ps(fiz2,tz);
386 fjx0 = _mm256_add_ps(fjx0,tx);
387 fjy0 = _mm256_add_ps(fjy0,ty);
388 fjz0 = _mm256_add_ps(fjz0,tz);
/* Write the accumulated j force back to the eight j atoms of this pass.
 * NOTE(review): the helper name indicates the value is subtracted from f - confirm
 * in the kernel utility header. */
392 fjptrA = f+j_coord_offsetA;
393 fjptrB = f+j_coord_offsetB;
394 fjptrC = f+j_coord_offsetC;
395 fjptrD = f+j_coord_offsetD;
396 fjptrE = f+j_coord_offsetE;
397 fjptrF = f+j_coord_offsetF;
398 fjptrG = f+j_coord_offsetG;
399 fjptrH = f+j_coord_offsetH;
401 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
403 /* Inner loop uses 129 flops */
/* Remainder pass: same computation as the loop above for a group of eight that
 * contains negative (padding) entries; those lanes are cleared with dummy_mask.
 * NOTE(review): the condition guarding this pass is not visible in this excerpt. */
409 /* Get j neighbor index, and coordinate index */
410 jnrlistA = jjnr[jidx];
411 jnrlistB = jjnr[jidx+1];
412 jnrlistC = jjnr[jidx+2];
413 jnrlistD = jjnr[jidx+3];
414 jnrlistE = jjnr[jidx+4];
415 jnrlistF = jjnr[jidx+5];
416 jnrlistG = jjnr[jidx+6];
417 jnrlistH = jjnr[jidx+7];
418 /* Sign of each element will be negative for non-real atoms.
419 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
420 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
422 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
423 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
425 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
426 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
427 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
428 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
429 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
430 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
431 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
432 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
433 j_coord_offsetA = DIM*jnrA;
434 j_coord_offsetB = DIM*jnrB;
435 j_coord_offsetC = DIM*jnrC;
436 j_coord_offsetD = DIM*jnrD;
437 j_coord_offsetE = DIM*jnrE;
438 j_coord_offsetF = DIM*jnrF;
439 j_coord_offsetG = DIM*jnrG;
440 j_coord_offsetH = DIM*jnrH;
442 /* load j atom coordinates */
443 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
444 x+j_coord_offsetC,x+j_coord_offsetD,
445 x+j_coord_offsetE,x+j_coord_offsetF,
446 x+j_coord_offsetG,x+j_coord_offsetH,
449 /* Calculate displacement vector */
450 dx00 = _mm256_sub_ps(ix0,jx0);
451 dy00 = _mm256_sub_ps(iy0,jy0);
452 dz00 = _mm256_sub_ps(iz0,jz0);
453 dx10 = _mm256_sub_ps(ix1,jx0);
454 dy10 = _mm256_sub_ps(iy1,jy0);
455 dz10 = _mm256_sub_ps(iz1,jz0);
456 dx20 = _mm256_sub_ps(ix2,jx0);
457 dy20 = _mm256_sub_ps(iy2,jy0);
458 dz20 = _mm256_sub_ps(iz2,jz0);
460 /* Calculate squared distance and things based on it */
461 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
462 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
463 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
465 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
466 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
467 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
469 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
470 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
471 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
473 /* Load parameters for j particles */
474 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
475 charge+jnrC+0,charge+jnrD+0,
476 charge+jnrE+0,charge+jnrF+0,
477 charge+jnrG+0,charge+jnrH+0);
478 vdwjidx0A = 2*vdwtype[jnrA+0];
479 vdwjidx0B = 2*vdwtype[jnrB+0];
480 vdwjidx0C = 2*vdwtype[jnrC+0];
481 vdwjidx0D = 2*vdwtype[jnrD+0];
482 vdwjidx0E = 2*vdwtype[jnrE+0];
483 vdwjidx0F = 2*vdwtype[jnrF+0];
484 vdwjidx0G = 2*vdwtype[jnrG+0];
485 vdwjidx0H = 2*vdwtype[jnrH+0];
487 fjx0 = _mm256_setzero_ps();
488 fjy0 = _mm256_setzero_ps();
489 fjz0 = _mm256_setzero_ps();
491 /**************************
492 * CALCULATE INTERACTIONS *
493 **************************/
495 if (gmx_mm256_any_lt(rsq00,rcutoff2))
498 /* Compute parameters for interactions between i and j atoms */
499 qq00 = _mm256_mul_ps(iq0,jq0);
500 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
501 vdwioffsetptr0+vdwjidx0B,
502 vdwioffsetptr0+vdwjidx0C,
503 vdwioffsetptr0+vdwjidx0D,
504 vdwioffsetptr0+vdwjidx0E,
505 vdwioffsetptr0+vdwjidx0F,
506 vdwioffsetptr0+vdwjidx0G,
507 vdwioffsetptr0+vdwjidx0H,
510 /* REACTION-FIELD ELECTROSTATICS */
511 velec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_add_ps(rinv00,_mm256_mul_ps(krf,rsq00)),crf));
512 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
514 /* LENNARD-JONES DISPERSION/REPULSION */
516 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
517 vvdw6 = _mm256_mul_ps(c6_00,rinvsix);
518 vvdw12 = _mm256_mul_ps(c12_00,_mm256_mul_ps(rinvsix,rinvsix));
519 vvdw = _mm256_sub_ps(_mm256_mul_ps( _mm256_sub_ps(vvdw12 , _mm256_mul_ps(c12_00,_mm256_mul_ps(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
520 _mm256_mul_ps( _mm256_sub_ps(vvdw6,_mm256_mul_ps(c6_00,sh_vdw_invrcut6)),one_sixth));
521 fvdw = _mm256_mul_ps(_mm256_sub_ps(vvdw12,vvdw6),rinvsq00);
523 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
525 /* Update potential sum for this i atom from the interaction with this j atom. */
526 velec = _mm256_and_ps(velec,cutoff_mask);
/* Padding lanes were loaded from atom 0 (jnr forced to 0 above), so their
 * contributions are cleared before being added to the sums and forces. */
527 velec = _mm256_andnot_ps(dummy_mask,velec);
528 velecsum = _mm256_add_ps(velecsum,velec);
529 vvdw = _mm256_and_ps(vvdw,cutoff_mask);
530 vvdw = _mm256_andnot_ps(dummy_mask,vvdw);
531 vvdwsum = _mm256_add_ps(vvdwsum,vvdw);
533 fscal = _mm256_add_ps(felec,fvdw);
535 fscal = _mm256_and_ps(fscal,cutoff_mask);
537 fscal = _mm256_andnot_ps(dummy_mask,fscal);
539 /* Calculate temporary vectorial force */
540 tx = _mm256_mul_ps(fscal,dx00);
541 ty = _mm256_mul_ps(fscal,dy00);
542 tz = _mm256_mul_ps(fscal,dz00);
544 /* Update vectorial force */
545 fix0 = _mm256_add_ps(fix0,tx);
546 fiy0 = _mm256_add_ps(fiy0,ty);
547 fiz0 = _mm256_add_ps(fiz0,tz);
549 fjx0 = _mm256_add_ps(fjx0,tx);
550 fjy0 = _mm256_add_ps(fjy0,ty);
551 fjz0 = _mm256_add_ps(fjz0,tz);
555 /**************************
556 * CALCULATE INTERACTIONS *
557 **************************/
559 if (gmx_mm256_any_lt(rsq10,rcutoff2))
562 /* Compute parameters for interactions between i and j atoms */
563 qq10 = _mm256_mul_ps(iq1,jq0);
565 /* REACTION-FIELD ELECTROSTATICS */
566 velec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_add_ps(rinv10,_mm256_mul_ps(krf,rsq10)),crf));
567 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
569 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
571 /* Update potential sum for this i atom from the interaction with this j atom. */
572 velec = _mm256_and_ps(velec,cutoff_mask);
573 velec = _mm256_andnot_ps(dummy_mask,velec);
574 velecsum = _mm256_add_ps(velecsum,velec);
578 fscal = _mm256_and_ps(fscal,cutoff_mask);
580 fscal = _mm256_andnot_ps(dummy_mask,fscal);
582 /* Calculate temporary vectorial force */
583 tx = _mm256_mul_ps(fscal,dx10);
584 ty = _mm256_mul_ps(fscal,dy10);
585 tz = _mm256_mul_ps(fscal,dz10);
587 /* Update vectorial force */
588 fix1 = _mm256_add_ps(fix1,tx);
589 fiy1 = _mm256_add_ps(fiy1,ty);
590 fiz1 = _mm256_add_ps(fiz1,tz);
592 fjx0 = _mm256_add_ps(fjx0,tx);
593 fjy0 = _mm256_add_ps(fjy0,ty);
594 fjz0 = _mm256_add_ps(fjz0,tz);
598 /**************************
599 * CALCULATE INTERACTIONS *
600 **************************/
602 if (gmx_mm256_any_lt(rsq20,rcutoff2))
605 /* Compute parameters for interactions between i and j atoms */
606 qq20 = _mm256_mul_ps(iq2,jq0);
608 /* REACTION-FIELD ELECTROSTATICS */
609 velec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_add_ps(rinv20,_mm256_mul_ps(krf,rsq20)),crf));
610 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
612 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
614 /* Update potential sum for this i atom from the interaction with this j atom. */
615 velec = _mm256_and_ps(velec,cutoff_mask);
616 velec = _mm256_andnot_ps(dummy_mask,velec);
617 velecsum = _mm256_add_ps(velecsum,velec);
621 fscal = _mm256_and_ps(fscal,cutoff_mask);
623 fscal = _mm256_andnot_ps(dummy_mask,fscal);
625 /* Calculate temporary vectorial force */
626 tx = _mm256_mul_ps(fscal,dx20);
627 ty = _mm256_mul_ps(fscal,dy20);
628 tz = _mm256_mul_ps(fscal,dz20);
630 /* Update vectorial force */
631 fix2 = _mm256_add_ps(fix2,tx);
632 fiy2 = _mm256_add_ps(fiy2,ty);
633 fiz2 = _mm256_add_ps(fiz2,tz);
635 fjx0 = _mm256_add_ps(fjx0,tx);
636 fjy0 = _mm256_add_ps(fjy0,ty);
637 fjz0 = _mm256_add_ps(fjz0,tz);
/* Padding lanes are pointed at scratch instead of f, so the write-back for them
 * does not touch a real atom (their force is already zero after dummy_mask).
 * NOTE(review): scratch is declared and initialised on lines not visible here. */
641 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
642 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
643 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
644 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
645 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
646 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
647 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
648 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
650 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
652 /* Inner loop uses 129 flops */
655 /* End of innermost loop */
/* Hand the nine i-site force accumulators to the update helper together with the
 * i atom's slot in f and this list's slot in fshift. */
657 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
658 f+i_coord_offset,fshift+i_shift_offset);
661 /* Update potential energies */
662 gmx_mm256_update_1pot_ps(velecsum,kernel_data->energygrp_elec+ggid);
663 gmx_mm256_update_1pot_ps(vvdwsum,kernel_data->energygrp_vdw+ggid);
665 /* Increment number of inner iterations */
/* Counts every list entry in [j_index_start, j_index_end), not passes of eight. */
666 inneriter += j_index_end - j_index_start;
668 /* Outer loop uses 20 flops */
671 /* Increment number of outer iterations */
674 /* Update outer/inner flops */
/* Flop estimate uses the per-iteration costs quoted above: 20 outer, 129 inner. */
676 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_VF,outeriter*20 + inneriter*129);
679 * Gromacs nonbonded kernel: nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_256_single
680 * Electrostatics interaction: ReactionField
681 * VdW interaction: LennardJones
682 * Geometry: Water3-Particle
683 * Calculate force/pot: Force
686 nb_kernel_ElecRFCut_VdwLJSh_GeomW3P1_F_avx_256_single
687 (t_nblist * gmx_restrict nlist,
688 rvec * gmx_restrict xx,
689 rvec * gmx_restrict ff,
690 t_forcerec * gmx_restrict fr,
691 t_mdatoms * gmx_restrict mdatoms,
692 nb_kernel_data_t * gmx_restrict kernel_data,
693 t_nrnb * gmx_restrict nrnb)
695 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
696 * just 0 for non-waters.
697 * Suffixes A,B,C,D,E,F,G,H refer to j loop unrolling done with AVX, e.g. for the eight different
698 * jnr indices corresponding to data put in the eight positions in the SIMD register.
700 int i_shift_offset,i_coord_offset,outeriter,inneriter;
701 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
702 int jnrA,jnrB,jnrC,jnrD;
703 int jnrE,jnrF,jnrG,jnrH;
704 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
705 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
706 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
707 int j_coord_offsetE,j_coord_offsetF,j_coord_offsetG,j_coord_offsetH;
708 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
710 real *shiftvec,*fshift,*x,*f;
711 real *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
713 __m256 tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
714 real * vdwioffsetptr0;
715 __m256 ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
716 real * vdwioffsetptr1;
717 __m256 ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
718 real * vdwioffsetptr2;
719 __m256 ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
720 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
721 __m256 jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
722 __m256 dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
723 __m256 dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
724 __m256 dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
725 __m256 velec,felec,velecsum,facel,crf,krf,krf2;
728 __m256 rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
731 __m256 one_sixth = _mm256_set1_ps(1.0/6.0);
732 __m256 one_twelfth = _mm256_set1_ps(1.0/12.0);
733 __m256 dummy_mask,cutoff_mask;
734 __m256 signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
735 __m256 one = _mm256_set1_ps(1.0);
736 __m256 two = _mm256_set1_ps(2.0);
742 jindex = nlist->jindex;
744 shiftidx = nlist->shift;
746 shiftvec = fr->shift_vec[0];
747 fshift = fr->fshift[0];
748 facel = _mm256_set1_ps(fr->epsfac);
749 charge = mdatoms->chargeA;
750 krf = _mm256_set1_ps(fr->ic->k_rf);
751 krf2 = _mm256_set1_ps(fr->ic->k_rf*2.0);
752 crf = _mm256_set1_ps(fr->ic->c_rf);
753 nvdwtype = fr->ntype;
755 vdwtype = mdatoms->typeA;
757 /* Setup water-specific parameters */
758 inr = nlist->iinr[0];
759 iq0 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+0]));
760 iq1 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+1]));
761 iq2 = _mm256_mul_ps(facel,_mm256_set1_ps(charge[inr+2]));
762 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
764 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
765 rcutoff_scalar = fr->rcoulomb;
766 rcutoff = _mm256_set1_ps(rcutoff_scalar);
767 rcutoff2 = _mm256_mul_ps(rcutoff,rcutoff);
769 sh_vdw_invrcut6 = _mm256_set1_ps(fr->ic->sh_invrc6);
770 rvdw = _mm256_set1_ps(fr->rvdw);
772 /* Avoid stupid compiler warnings */
773 jnrA = jnrB = jnrC = jnrD = jnrE = jnrF = jnrG = jnrH = 0;
786 for(iidx=0;iidx<4*DIM;iidx++)
791 /* Start outer loop over neighborlists */
792 for(iidx=0; iidx<nri; iidx++)
794 /* Load shift vector for this list */
795 i_shift_offset = DIM*shiftidx[iidx];
797 /* Load limits for loop over neighbors */
798 j_index_start = jindex[iidx];
799 j_index_end = jindex[iidx+1];
801 /* Get outer coordinate index */
803 i_coord_offset = DIM*inr;
805 /* Load i particle coords and add shift vector */
806 gmx_mm256_load_shift_and_3rvec_broadcast_ps(shiftvec+i_shift_offset,x+i_coord_offset,
807 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2);
809 fix0 = _mm256_setzero_ps();
810 fiy0 = _mm256_setzero_ps();
811 fiz0 = _mm256_setzero_ps();
812 fix1 = _mm256_setzero_ps();
813 fiy1 = _mm256_setzero_ps();
814 fiz1 = _mm256_setzero_ps();
815 fix2 = _mm256_setzero_ps();
816 fiy2 = _mm256_setzero_ps();
817 fiz2 = _mm256_setzero_ps();
819 /* Start inner kernel loop */
820 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+7]>=0; jidx+=8)
823 /* Get j neighbor index, and coordinate index */
832 j_coord_offsetA = DIM*jnrA;
833 j_coord_offsetB = DIM*jnrB;
834 j_coord_offsetC = DIM*jnrC;
835 j_coord_offsetD = DIM*jnrD;
836 j_coord_offsetE = DIM*jnrE;
837 j_coord_offsetF = DIM*jnrF;
838 j_coord_offsetG = DIM*jnrG;
839 j_coord_offsetH = DIM*jnrH;
841 /* load j atom coordinates */
842 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
843 x+j_coord_offsetC,x+j_coord_offsetD,
844 x+j_coord_offsetE,x+j_coord_offsetF,
845 x+j_coord_offsetG,x+j_coord_offsetH,
848 /* Calculate displacement vector */
849 dx00 = _mm256_sub_ps(ix0,jx0);
850 dy00 = _mm256_sub_ps(iy0,jy0);
851 dz00 = _mm256_sub_ps(iz0,jz0);
852 dx10 = _mm256_sub_ps(ix1,jx0);
853 dy10 = _mm256_sub_ps(iy1,jy0);
854 dz10 = _mm256_sub_ps(iz1,jz0);
855 dx20 = _mm256_sub_ps(ix2,jx0);
856 dy20 = _mm256_sub_ps(iy2,jy0);
857 dz20 = _mm256_sub_ps(iz2,jz0);
859 /* Calculate squared distance and things based on it */
860 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
861 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
862 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
864 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
865 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
866 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
868 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
869 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
870 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
872 /* Load parameters for j particles */
873 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
874 charge+jnrC+0,charge+jnrD+0,
875 charge+jnrE+0,charge+jnrF+0,
876 charge+jnrG+0,charge+jnrH+0);
877 vdwjidx0A = 2*vdwtype[jnrA+0];
878 vdwjidx0B = 2*vdwtype[jnrB+0];
879 vdwjidx0C = 2*vdwtype[jnrC+0];
880 vdwjidx0D = 2*vdwtype[jnrD+0];
881 vdwjidx0E = 2*vdwtype[jnrE+0];
882 vdwjidx0F = 2*vdwtype[jnrF+0];
883 vdwjidx0G = 2*vdwtype[jnrG+0];
884 vdwjidx0H = 2*vdwtype[jnrH+0];
886 fjx0 = _mm256_setzero_ps();
887 fjy0 = _mm256_setzero_ps();
888 fjz0 = _mm256_setzero_ps();
890 /**************************
891 * CALCULATE INTERACTIONS *
892 **************************/
894 if (gmx_mm256_any_lt(rsq00,rcutoff2))
897 /* Compute parameters for interactions between i and j atoms */
898 qq00 = _mm256_mul_ps(iq0,jq0);
899 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
900 vdwioffsetptr0+vdwjidx0B,
901 vdwioffsetptr0+vdwjidx0C,
902 vdwioffsetptr0+vdwjidx0D,
903 vdwioffsetptr0+vdwjidx0E,
904 vdwioffsetptr0+vdwjidx0F,
905 vdwioffsetptr0+vdwjidx0G,
906 vdwioffsetptr0+vdwjidx0H,
909 /* REACTION-FIELD ELECTROSTATICS */
910 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
912 /* LENNARD-JONES DISPERSION/REPULSION */
914 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
915 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
917 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
919 fscal = _mm256_add_ps(felec,fvdw);
921 fscal = _mm256_and_ps(fscal,cutoff_mask);
923 /* Calculate temporary vectorial force */
924 tx = _mm256_mul_ps(fscal,dx00);
925 ty = _mm256_mul_ps(fscal,dy00);
926 tz = _mm256_mul_ps(fscal,dz00);
928 /* Update vectorial force */
929 fix0 = _mm256_add_ps(fix0,tx);
930 fiy0 = _mm256_add_ps(fiy0,ty);
931 fiz0 = _mm256_add_ps(fiz0,tz);
933 fjx0 = _mm256_add_ps(fjx0,tx);
934 fjy0 = _mm256_add_ps(fjy0,ty);
935 fjz0 = _mm256_add_ps(fjz0,tz);
939 /**************************
940 * CALCULATE INTERACTIONS *
941 **************************/
943 if (gmx_mm256_any_lt(rsq10,rcutoff2))
946 /* Compute parameters for interactions between i and j atoms */
947 qq10 = _mm256_mul_ps(iq1,jq0);
949 /* REACTION-FIELD ELECTROSTATICS */
950 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
952 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
956 fscal = _mm256_and_ps(fscal,cutoff_mask);
958 /* Calculate temporary vectorial force */
959 tx = _mm256_mul_ps(fscal,dx10);
960 ty = _mm256_mul_ps(fscal,dy10);
961 tz = _mm256_mul_ps(fscal,dz10);
963 /* Update vectorial force */
964 fix1 = _mm256_add_ps(fix1,tx);
965 fiy1 = _mm256_add_ps(fiy1,ty);
966 fiz1 = _mm256_add_ps(fiz1,tz);
968 fjx0 = _mm256_add_ps(fjx0,tx);
969 fjy0 = _mm256_add_ps(fjy0,ty);
970 fjz0 = _mm256_add_ps(fjz0,tz);
974 /**************************
975 * CALCULATE INTERACTIONS *
976 **************************/
978 if (gmx_mm256_any_lt(rsq20,rcutoff2))
981 /* Compute parameters for interactions between i and j atoms */
982 qq20 = _mm256_mul_ps(iq2,jq0);
984 /* REACTION-FIELD ELECTROSTATICS */
985 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
987 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
991 fscal = _mm256_and_ps(fscal,cutoff_mask);
993 /* Calculate temporary vectorial force */
994 tx = _mm256_mul_ps(fscal,dx20);
995 ty = _mm256_mul_ps(fscal,dy20);
996 tz = _mm256_mul_ps(fscal,dz20);
998 /* Update vectorial force */
999 fix2 = _mm256_add_ps(fix2,tx);
1000 fiy2 = _mm256_add_ps(fiy2,ty);
1001 fiz2 = _mm256_add_ps(fiz2,tz);
1003 fjx0 = _mm256_add_ps(fjx0,tx);
1004 fjy0 = _mm256_add_ps(fjy0,ty);
1005 fjz0 = _mm256_add_ps(fjz0,tz);
1009 fjptrA = f+j_coord_offsetA;
1010 fjptrB = f+j_coord_offsetB;
1011 fjptrC = f+j_coord_offsetC;
1012 fjptrD = f+j_coord_offsetD;
1013 fjptrE = f+j_coord_offsetE;
1014 fjptrF = f+j_coord_offsetF;
1015 fjptrG = f+j_coord_offsetG;
1016 fjptrH = f+j_coord_offsetH;
1018 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1020 /* Inner loop uses 100 flops */
1023 if(jidx<j_index_end)
1026 /* Get j neighbor index, and coordinate index */
1027 jnrlistA = jjnr[jidx];
1028 jnrlistB = jjnr[jidx+1];
1029 jnrlistC = jjnr[jidx+2];
1030 jnrlistD = jjnr[jidx+3];
1031 jnrlistE = jjnr[jidx+4];
1032 jnrlistF = jjnr[jidx+5];
1033 jnrlistG = jjnr[jidx+6];
1034 jnrlistH = jjnr[jidx+7];
1035 /* Sign of each element will be negative for non-real atoms.
1036 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1037 * so use it as val = _mm256_andnot_ps(mask,val) to clear dummy entries.
1039 dummy_mask = gmx_mm256_set_m128(gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx+4)),_mm_setzero_si128())),
1040 gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128())));
1042 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1043 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1044 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1045 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1046 jnrE = (jnrlistE>=0) ? jnrlistE : 0;
1047 jnrF = (jnrlistF>=0) ? jnrlistF : 0;
1048 jnrG = (jnrlistG>=0) ? jnrlistG : 0;
1049 jnrH = (jnrlistH>=0) ? jnrlistH : 0;
1050 j_coord_offsetA = DIM*jnrA;
1051 j_coord_offsetB = DIM*jnrB;
1052 j_coord_offsetC = DIM*jnrC;
1053 j_coord_offsetD = DIM*jnrD;
1054 j_coord_offsetE = DIM*jnrE;
1055 j_coord_offsetF = DIM*jnrF;
1056 j_coord_offsetG = DIM*jnrG;
1057 j_coord_offsetH = DIM*jnrH;
1059 /* load j atom coordinates */
1060 gmx_mm256_load_1rvec_8ptr_swizzle_ps(x+j_coord_offsetA,x+j_coord_offsetB,
1061 x+j_coord_offsetC,x+j_coord_offsetD,
1062 x+j_coord_offsetE,x+j_coord_offsetF,
1063 x+j_coord_offsetG,x+j_coord_offsetH,
1066 /* Calculate displacement vector */
1067 dx00 = _mm256_sub_ps(ix0,jx0);
1068 dy00 = _mm256_sub_ps(iy0,jy0);
1069 dz00 = _mm256_sub_ps(iz0,jz0);
1070 dx10 = _mm256_sub_ps(ix1,jx0);
1071 dy10 = _mm256_sub_ps(iy1,jy0);
1072 dz10 = _mm256_sub_ps(iz1,jz0);
1073 dx20 = _mm256_sub_ps(ix2,jx0);
1074 dy20 = _mm256_sub_ps(iy2,jy0);
1075 dz20 = _mm256_sub_ps(iz2,jz0);
1077 /* Calculate squared distance and things based on it */
1078 rsq00 = gmx_mm256_calc_rsq_ps(dx00,dy00,dz00);
1079 rsq10 = gmx_mm256_calc_rsq_ps(dx10,dy10,dz10);
1080 rsq20 = gmx_mm256_calc_rsq_ps(dx20,dy20,dz20);
1082 rinv00 = gmx_mm256_invsqrt_ps(rsq00);
1083 rinv10 = gmx_mm256_invsqrt_ps(rsq10);
1084 rinv20 = gmx_mm256_invsqrt_ps(rsq20);
1086 rinvsq00 = _mm256_mul_ps(rinv00,rinv00);
1087 rinvsq10 = _mm256_mul_ps(rinv10,rinv10);
1088 rinvsq20 = _mm256_mul_ps(rinv20,rinv20);
1090 /* Load parameters for j particles */
1091 jq0 = gmx_mm256_load_8real_swizzle_ps(charge+jnrA+0,charge+jnrB+0,
1092 charge+jnrC+0,charge+jnrD+0,
1093 charge+jnrE+0,charge+jnrF+0,
1094 charge+jnrG+0,charge+jnrH+0);
1095 vdwjidx0A = 2*vdwtype[jnrA+0];
1096 vdwjidx0B = 2*vdwtype[jnrB+0];
1097 vdwjidx0C = 2*vdwtype[jnrC+0];
1098 vdwjidx0D = 2*vdwtype[jnrD+0];
1099 vdwjidx0E = 2*vdwtype[jnrE+0];
1100 vdwjidx0F = 2*vdwtype[jnrF+0];
1101 vdwjidx0G = 2*vdwtype[jnrG+0];
1102 vdwjidx0H = 2*vdwtype[jnrH+0];
1104 fjx0 = _mm256_setzero_ps();
1105 fjy0 = _mm256_setzero_ps();
1106 fjz0 = _mm256_setzero_ps();
1108 /**************************
1109 * CALCULATE INTERACTIONS *
1110 **************************/
1112 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1115 /* Compute parameters for interactions between i and j atoms */
1116 qq00 = _mm256_mul_ps(iq0,jq0);
1117 gmx_mm256_load_8pair_swizzle_ps(vdwioffsetptr0+vdwjidx0A,
1118 vdwioffsetptr0+vdwjidx0B,
1119 vdwioffsetptr0+vdwjidx0C,
1120 vdwioffsetptr0+vdwjidx0D,
1121 vdwioffsetptr0+vdwjidx0E,
1122 vdwioffsetptr0+vdwjidx0F,
1123 vdwioffsetptr0+vdwjidx0G,
1124 vdwioffsetptr0+vdwjidx0H,
1127 /* REACTION-FIELD ELECTROSTATICS */
1128 felec = _mm256_mul_ps(qq00,_mm256_sub_ps(_mm256_mul_ps(rinv00,rinvsq00),krf2));
1130 /* LENNARD-JONES DISPERSION/REPULSION */
1132 rinvsix = _mm256_mul_ps(_mm256_mul_ps(rinvsq00,rinvsq00),rinvsq00);
1133 fvdw = _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(c12_00,rinvsix),c6_00),_mm256_mul_ps(rinvsix,rinvsq00));
1135 cutoff_mask = _mm256_cmp_ps(rsq00,rcutoff2,_CMP_LT_OQ);
1137 fscal = _mm256_add_ps(felec,fvdw);
1139 fscal = _mm256_and_ps(fscal,cutoff_mask);
1141 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1143 /* Calculate temporary vectorial force */
1144 tx = _mm256_mul_ps(fscal,dx00);
1145 ty = _mm256_mul_ps(fscal,dy00);
1146 tz = _mm256_mul_ps(fscal,dz00);
1148 /* Update vectorial force */
1149 fix0 = _mm256_add_ps(fix0,tx);
1150 fiy0 = _mm256_add_ps(fiy0,ty);
1151 fiz0 = _mm256_add_ps(fiz0,tz);
1153 fjx0 = _mm256_add_ps(fjx0,tx);
1154 fjy0 = _mm256_add_ps(fjy0,ty);
1155 fjz0 = _mm256_add_ps(fjz0,tz);
1159 /**************************
1160 * CALCULATE INTERACTIONS *
1161 **************************/
1163 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1166 /* Compute parameters for interactions between i and j atoms */
1167 qq10 = _mm256_mul_ps(iq1,jq0);
1169 /* REACTION-FIELD ELECTROSTATICS */
1170 felec = _mm256_mul_ps(qq10,_mm256_sub_ps(_mm256_mul_ps(rinv10,rinvsq10),krf2));
1172 cutoff_mask = _mm256_cmp_ps(rsq10,rcutoff2,_CMP_LT_OQ);
1176 fscal = _mm256_and_ps(fscal,cutoff_mask);
1178 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1180 /* Calculate temporary vectorial force */
1181 tx = _mm256_mul_ps(fscal,dx10);
1182 ty = _mm256_mul_ps(fscal,dy10);
1183 tz = _mm256_mul_ps(fscal,dz10);
1185 /* Update vectorial force */
1186 fix1 = _mm256_add_ps(fix1,tx);
1187 fiy1 = _mm256_add_ps(fiy1,ty);
1188 fiz1 = _mm256_add_ps(fiz1,tz);
1190 fjx0 = _mm256_add_ps(fjx0,tx);
1191 fjy0 = _mm256_add_ps(fjy0,ty);
1192 fjz0 = _mm256_add_ps(fjz0,tz);
1196 /**************************
1197 * CALCULATE INTERACTIONS *
1198 **************************/
1200 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1203 /* Compute parameters for interactions between i and j atoms */
1204 qq20 = _mm256_mul_ps(iq2,jq0);
1206 /* REACTION-FIELD ELECTROSTATICS */
1207 felec = _mm256_mul_ps(qq20,_mm256_sub_ps(_mm256_mul_ps(rinv20,rinvsq20),krf2));
1209 cutoff_mask = _mm256_cmp_ps(rsq20,rcutoff2,_CMP_LT_OQ);
1213 fscal = _mm256_and_ps(fscal,cutoff_mask);
1215 fscal = _mm256_andnot_ps(dummy_mask,fscal);
1217 /* Calculate temporary vectorial force */
1218 tx = _mm256_mul_ps(fscal,dx20);
1219 ty = _mm256_mul_ps(fscal,dy20);
1220 tz = _mm256_mul_ps(fscal,dz20);
1222 /* Update vectorial force */
1223 fix2 = _mm256_add_ps(fix2,tx);
1224 fiy2 = _mm256_add_ps(fiy2,ty);
1225 fiz2 = _mm256_add_ps(fiz2,tz);
1227 fjx0 = _mm256_add_ps(fjx0,tx);
1228 fjy0 = _mm256_add_ps(fjy0,ty);
1229 fjz0 = _mm256_add_ps(fjz0,tz);
1233 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1234 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1235 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1236 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1237 fjptrE = (jnrlistE>=0) ? f+j_coord_offsetE : scratch;
1238 fjptrF = (jnrlistF>=0) ? f+j_coord_offsetF : scratch;
1239 fjptrG = (jnrlistG>=0) ? f+j_coord_offsetG : scratch;
1240 fjptrH = (jnrlistH>=0) ? f+j_coord_offsetH : scratch;
1242 gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(fjptrA,fjptrB,fjptrC,fjptrD,fjptrE,fjptrF,fjptrG,fjptrH,fjx0,fjy0,fjz0);
1244 /* Inner loop uses 100 flops */
1247 /* End of innermost loop */
1249 gmx_mm256_update_iforce_3atom_swizzle_ps(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,
1250 f+i_coord_offset,fshift+i_shift_offset);
1252 /* Increment number of inner iterations */
1253 inneriter += j_index_end - j_index_start;
1255 /* Outer loop uses 18 flops */
1258 /* Increment number of outer iterations */
1261 /* Update outer/inner flops */
1263 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W3_F,outeriter*18 + inneriter*100);