2 * Note: this file was generated by the Gromacs avx_256_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_256_double.h"
34 #include "kernelutil_x86_avx_256_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
/*
 * Potential-and-force kernel for a four-site i water (sites 0-3) against
 * single j particles, processing four j entries per iteration in 256-bit
 * double-precision AVX registers.
 *
 * Site 0 takes part only in the shifted Lennard-Jones interaction; sites 1-3
 * take part only in the tabulated, shifted Ewald electrostatics. Every pair
 * term is zeroed in lanes where rsq is not below rcutoff2 (from fr->rcoulomb).
 *
 * nlist       - neighbor list; jindex, shift and iinr[0] are read here.
 * xx, ff      - NOTE(review): presumably the coordinate and force arrays
 *               behind x and f; the assignments are not visible here.
 * fr          - source of epsfac, shift_vec, fshift, ntype, rcoulomb, rvdw
 *               and the fr->ic Ewald table, scale and shift constants.
 * mdatoms     - source of the chargeA and typeA arrays.
 * kernel_data - energygrp_elec / energygrp_vdw are passed to the potential
 *               update helper at offset ggid.
 * nrnb        - handed to inc_nrnb for the flop accounting at the end.
 *
 * NOTE(review): several statements this code relies on (for example the
 * assignments of x, f, nri, jjnr, vdwparam, ggid, fscal, and of jnrA..jnrD
 * in the main j loop) are not visible in this text; confirm them against
 * the complete generated kernel before changing any logic.
 */
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_256_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
60 int jnrA,jnrB,jnrC,jnrD;
61 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
62 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
63 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
64 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
66 real *shiftvec,*fshift,*x,*f;
67 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
69 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
70 real * vdwioffsetptr0;
71 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
72 real * vdwioffsetptr1;
73 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
74 real * vdwioffsetptr2;
75 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
76 real * vdwioffsetptr3;
77 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
78 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
79 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
80 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
81 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
82 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
83 __m256d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
84 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
87 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
90 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
91 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
93 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
94 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
96 __m256d dummy_mask,cutoff_mask;
97 __m128 tmpmask0,tmpmask1;
98 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
99 __m256d one = _mm256_set1_pd(1.0);
100 __m256d two = _mm256_set1_pd(2.0);
106 jindex = nlist->jindex;
108 shiftidx = nlist->shift;
110 shiftvec = fr->shift_vec[0];
111 fshift = fr->fshift[0];
112 facel = _mm256_set1_pd(fr->epsfac);
113 charge = mdatoms->chargeA;
114 nvdwtype = fr->ntype;
116 vdwtype = mdatoms->typeA;
118 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
119 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
120 beta2 = _mm256_mul_pd(beta,beta);
121 beta3 = _mm256_mul_pd(beta,beta2);
123 ewtab = fr->ic->tabq_coul_FDV0;
124 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
125 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
127 /* Setup water-specific parameters */
/* Read once, from the first i entry of the list: the charges of sites 1-3 are
 * pre-multiplied by facel, and vdwioffsetptr0 is the base into vdwparam that
 * is later offset by 2*vdwtype[j] for the site-0 parameter loads. */
128 inr = nlist->iinr[0];
129 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
130 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
131 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
132 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
134 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
135 rcutoff_scalar = fr->rcoulomb;
136 rcutoff = _mm256_set1_pd(rcutoff_scalar);
137 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
/* sh_vdw_invrcut6 is the constant subtracted (scaled by c6, and squared times c12) in the shifted LJ potential below */
139 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
140 rvdw = _mm256_set1_pd(fr->rvdw);
142 /* Avoid stupid compiler warnings */
143 jnrA = jnrB = jnrC = jnrD = 0;
/* NOTE(review): the body of this loop is not visible here; presumably it clears the scratch buffer used further down -- confirm */
152 for(iidx=0;iidx<4*DIM;iidx++)
157 /* Start outer loop over neighborlists */
158 for(iidx=0; iidx<nri; iidx++)
160 /* Load shift vector for this list */
161 i_shift_offset = DIM*shiftidx[iidx];
163 /* Load limits for loop over neighbors */
164 j_index_start = jindex[iidx];
165 j_index_end = jindex[iidx+1];
167 /* Get outer coordinate index */
169 i_coord_offset = DIM*inr;
171 /* Load i particle coords and add shift vector */
172 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
173 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Clear the force accumulators of the four i sites */
175 fix0 = _mm256_setzero_pd();
176 fiy0 = _mm256_setzero_pd();
177 fiz0 = _mm256_setzero_pd();
178 fix1 = _mm256_setzero_pd();
179 fiy1 = _mm256_setzero_pd();
180 fiz1 = _mm256_setzero_pd();
181 fix2 = _mm256_setzero_pd();
182 fiy2 = _mm256_setzero_pd();
183 fiz2 = _mm256_setzero_pd();
184 fix3 = _mm256_setzero_pd();
185 fiy3 = _mm256_setzero_pd();
186 fiz3 = _mm256_setzero_pd();
188 /* Reset potential sums */
189 velecsum = _mm256_setzero_pd();
190 vvdwsum = _mm256_setzero_pd();
192 /* Start inner kernel loop */
/* Steps through the list four entries at a time and stops at the first group whose fourth entry is negative */
193 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
196 /* Get j neighbor index, and coordinate index */
201 j_coord_offsetA = DIM*jnrA;
202 j_coord_offsetB = DIM*jnrB;
203 j_coord_offsetC = DIM*jnrC;
204 j_coord_offsetD = DIM*jnrD;
206 /* load j atom coordinates */
207 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
208 x+j_coord_offsetC,x+j_coord_offsetD,
211 /* Calculate displacement vector */
212 dx00 = _mm256_sub_pd(ix0,jx0);
213 dy00 = _mm256_sub_pd(iy0,jy0);
214 dz00 = _mm256_sub_pd(iz0,jz0);
215 dx10 = _mm256_sub_pd(ix1,jx0);
216 dy10 = _mm256_sub_pd(iy1,jy0);
217 dz10 = _mm256_sub_pd(iz1,jz0);
218 dx20 = _mm256_sub_pd(ix2,jx0);
219 dy20 = _mm256_sub_pd(iy2,jy0);
220 dz20 = _mm256_sub_pd(iz2,jz0);
221 dx30 = _mm256_sub_pd(ix3,jx0);
222 dy30 = _mm256_sub_pd(iy3,jy0);
223 dz30 = _mm256_sub_pd(iz3,jz0);
225 /* Calculate squared distance and things based on it */
226 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
227 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
228 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
229 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
/* Sites 1-3 use rinv and rinv*rinv in the electrostatics; site 0 only uses rinvsq00, taken directly from rsq00 */
231 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
232 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
233 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
235 rinvsq00 = gmx_mm256_inv_pd(rsq00);
236 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
237 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
238 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
240 /* Load parameters for j particles */
241 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
242 charge+jnrC+0,charge+jnrD+0);
243 vdwjidx0A = 2*vdwtype[jnrA+0];
244 vdwjidx0B = 2*vdwtype[jnrB+0];
245 vdwjidx0C = 2*vdwtype[jnrC+0];
246 vdwjidx0D = 2*vdwtype[jnrD+0];
/* j-force accumulator shared by all four i sites for this group */
248 fjx0 = _mm256_setzero_pd();
249 fjy0 = _mm256_setzero_pd();
250 fjz0 = _mm256_setzero_pd();
252 /**************************
253 * CALCULATE INTERACTIONS *
254 **************************/
/* Site 0 - j: Lennard-Jones only; presumably entered when at least one lane has rsq00 < rcutoff2 */
256 if (gmx_mm256_any_lt(rsq00,rcutoff2))
259 /* Compute parameters for interactions between i and j atoms */
260 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
261 vdwioffsetptr0+vdwjidx0B,
262 vdwioffsetptr0+vdwjidx0C,
263 vdwioffsetptr0+vdwjidx0D,
266 /* LENNARD-JONES DISPERSION/REPULSION */
/* vvdw = (vvdw12 - c12*sh*sh)/12 - (vvdw6 - c6*sh)/6 with sh = sh_vdw_invrcut6;
 * fvdw = (vvdw12 - vvdw6)*rinvsq00 */
268 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
269 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
270 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
271 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
272 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
273 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
/* All-ones in lanes with rsq00 < rcutoff2, zero elsewhere; AND-ing with it clears the other lanes */
275 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
277 /* Update potential sum for this i atom from the interaction with this j atom. */
278 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
279 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
/* NOTE(review): the statement that sets fscal for this pair is not visible here (same for the pairs below) */
283 fscal = _mm256_and_pd(fscal,cutoff_mask);
285 /* Calculate temporary vectorial force */
286 tx = _mm256_mul_pd(fscal,dx00);
287 ty = _mm256_mul_pd(fscal,dy00);
288 tz = _mm256_mul_pd(fscal,dz00);
290 /* Update vectorial force */
291 fix0 = _mm256_add_pd(fix0,tx);
292 fiy0 = _mm256_add_pd(fiy0,ty);
293 fiz0 = _mm256_add_pd(fiz0,tz);
/* The same vector is accumulated for j and handed to the decrement helper once all four sites are done */
295 fjx0 = _mm256_add_pd(fjx0,tx);
296 fjy0 = _mm256_add_pd(fjy0,ty);
297 fjz0 = _mm256_add_pd(fjz0,tz);
301 /**************************
302 * CALCULATE INTERACTIONS *
303 **************************/
/* Site 1 - j: Ewald electrostatics only */
305 if (gmx_mm256_any_lt(rsq10,rcutoff2))
308 r10 = _mm256_mul_pd(rsq10,rinv10);
310 /* Compute parameters for interactions between i and j atoms */
311 qq10 = _mm256_mul_pd(iq1,jq0);
313 /* EWALD ELECTROSTATICS */
315 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
316 ewrt = _mm256_mul_pd(r10,ewtabscale);
317 ewitab = _mm256_cvttpd_epi32(ewrt);
/* eweps is the fractional part of ewrt; the index is multiplied by 4 because four doubles are loaded per table point */
318 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
319 ewitab = _mm_slli_epi32(ewitab,2);
320 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
321 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
322 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
323 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
/* Presumably the transpose leaves entries 0..3 of each lane's table point in ewtabF, ewtabD, ewtabV, ewtabFn -- confirm against the macro */
324 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
/* felec = F + eps*D; velec = V - ewtabhalfspace*eps*(F + felec);
 * then velec = qq*((rinv - sh_ewald) - velec) and felec = qq*rinv*(rinvsq - felec) */
325 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
326 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
327 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
328 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
330 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
332 /* Update potential sum for this i atom from the interaction with this j atom. */
333 velec = _mm256_and_pd(velec,cutoff_mask);
334 velecsum = _mm256_add_pd(velecsum,velec);
338 fscal = _mm256_and_pd(fscal,cutoff_mask);
340 /* Calculate temporary vectorial force */
341 tx = _mm256_mul_pd(fscal,dx10);
342 ty = _mm256_mul_pd(fscal,dy10);
343 tz = _mm256_mul_pd(fscal,dz10);
345 /* Update vectorial force */
346 fix1 = _mm256_add_pd(fix1,tx);
347 fiy1 = _mm256_add_pd(fiy1,ty);
348 fiz1 = _mm256_add_pd(fiz1,tz);
350 fjx0 = _mm256_add_pd(fjx0,tx);
351 fjy0 = _mm256_add_pd(fjy0,ty);
352 fjz0 = _mm256_add_pd(fjz0,tz);
356 /**************************
357 * CALCULATE INTERACTIONS *
358 **************************/
/* Site 2 - j: same Ewald evaluation as for site 1, using the *20 quantities and iq2 */
360 if (gmx_mm256_any_lt(rsq20,rcutoff2))
363 r20 = _mm256_mul_pd(rsq20,rinv20);
365 /* Compute parameters for interactions between i and j atoms */
366 qq20 = _mm256_mul_pd(iq2,jq0);
368 /* EWALD ELECTROSTATICS */
370 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
371 ewrt = _mm256_mul_pd(r20,ewtabscale);
372 ewitab = _mm256_cvttpd_epi32(ewrt);
373 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
374 ewitab = _mm_slli_epi32(ewitab,2);
375 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
376 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
377 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
378 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
379 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
380 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
381 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
382 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
383 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
385 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
387 /* Update potential sum for this i atom from the interaction with this j atom. */
388 velec = _mm256_and_pd(velec,cutoff_mask);
389 velecsum = _mm256_add_pd(velecsum,velec);
393 fscal = _mm256_and_pd(fscal,cutoff_mask);
395 /* Calculate temporary vectorial force */
396 tx = _mm256_mul_pd(fscal,dx20);
397 ty = _mm256_mul_pd(fscal,dy20);
398 tz = _mm256_mul_pd(fscal,dz20);
400 /* Update vectorial force */
401 fix2 = _mm256_add_pd(fix2,tx);
402 fiy2 = _mm256_add_pd(fiy2,ty);
403 fiz2 = _mm256_add_pd(fiz2,tz);
405 fjx0 = _mm256_add_pd(fjx0,tx);
406 fjy0 = _mm256_add_pd(fjy0,ty);
407 fjz0 = _mm256_add_pd(fjz0,tz);
411 /**************************
412 * CALCULATE INTERACTIONS *
413 **************************/
/* Site 3 - j: same Ewald evaluation as for site 1, using the *30 quantities and iq3 */
415 if (gmx_mm256_any_lt(rsq30,rcutoff2))
418 r30 = _mm256_mul_pd(rsq30,rinv30);
420 /* Compute parameters for interactions between i and j atoms */
421 qq30 = _mm256_mul_pd(iq3,jq0);
423 /* EWALD ELECTROSTATICS */
425 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
426 ewrt = _mm256_mul_pd(r30,ewtabscale);
427 ewitab = _mm256_cvttpd_epi32(ewrt);
428 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
429 ewitab = _mm_slli_epi32(ewitab,2);
430 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
431 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
432 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
433 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
434 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
435 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
436 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
437 velec = _mm256_mul_pd(qq30,_mm256_sub_pd(_mm256_sub_pd(rinv30,sh_ewald),velec));
438 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
440 cutoff_mask = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
442 /* Update potential sum for this i atom from the interaction with this j atom. */
443 velec = _mm256_and_pd(velec,cutoff_mask);
444 velecsum = _mm256_add_pd(velecsum,velec);
448 fscal = _mm256_and_pd(fscal,cutoff_mask);
450 /* Calculate temporary vectorial force */
451 tx = _mm256_mul_pd(fscal,dx30);
452 ty = _mm256_mul_pd(fscal,dy30);
453 tz = _mm256_mul_pd(fscal,dz30);
455 /* Update vectorial force */
456 fix3 = _mm256_add_pd(fix3,tx);
457 fiy3 = _mm256_add_pd(fiy3,ty);
458 fiz3 = _mm256_add_pd(fiz3,tz);
460 fjx0 = _mm256_add_pd(fjx0,tx);
461 fjy0 = _mm256_add_pd(fjy0,ty);
462 fjz0 = _mm256_add_pd(fjz0,tz);
/* Pass the accumulated j force and the four destination pointers in f to the decrement helper */
466 fjptrA = f+j_coord_offsetA;
467 fjptrB = f+j_coord_offsetB;
468 fjptrC = f+j_coord_offsetC;
469 fjptrD = f+j_coord_offsetD;
471 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
473 /* Inner loop uses 182 flops */
/*
 * Code following the main j loop: it handles a group of four list entries in
 * which some indices may be negative (dummy entries).
 *  - The four indices are compared against zero with a signed 32-bit compare
 *    and each 32-bit result is duplicated into a 64-bit lane to form
 *    dummy_mask.
 *  - Negative indices are replaced by 0 before they are used for loads.
 *  - Potentials and fscal are cleared in dummy lanes with
 *    _mm256_andnot_pd(dummy_mask, ...), and the j force of a dummy entry is
 *    directed to scratch instead of f.
 * NOTE(review): any guard deciding whether this section runs is not visible
 * here -- confirm against the complete kernel.
 */
479 /* Get j neighbor index, and coordinate index */
480 jnrlistA = jjnr[jidx];
481 jnrlistB = jjnr[jidx+1];
482 jnrlistC = jjnr[jidx+2];
483 jnrlistD = jjnr[jidx+3];
484 /* Sign of each element will be negative for non-real atoms.
485 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
486 * so use it as val = _mm_andnot_pd(mask,val) to clear dummy entries.
488 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
490 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
491 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
492 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
494 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
495 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
496 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
497 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
498 j_coord_offsetA = DIM*jnrA;
499 j_coord_offsetB = DIM*jnrB;
500 j_coord_offsetC = DIM*jnrC;
501 j_coord_offsetD = DIM*jnrD;
503 /* load j atom coordinates */
504 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
505 x+j_coord_offsetC,x+j_coord_offsetD,
508 /* Calculate displacement vector */
509 dx00 = _mm256_sub_pd(ix0,jx0);
510 dy00 = _mm256_sub_pd(iy0,jy0);
511 dz00 = _mm256_sub_pd(iz0,jz0);
512 dx10 = _mm256_sub_pd(ix1,jx0);
513 dy10 = _mm256_sub_pd(iy1,jy0);
514 dz10 = _mm256_sub_pd(iz1,jz0);
515 dx20 = _mm256_sub_pd(ix2,jx0);
516 dy20 = _mm256_sub_pd(iy2,jy0);
517 dz20 = _mm256_sub_pd(iz2,jz0);
518 dx30 = _mm256_sub_pd(ix3,jx0);
519 dy30 = _mm256_sub_pd(iy3,jy0);
520 dz30 = _mm256_sub_pd(iz3,jz0);
522 /* Calculate squared distance and things based on it */
523 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
524 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
525 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
526 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
528 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
529 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
530 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
532 rinvsq00 = gmx_mm256_inv_pd(rsq00);
533 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
534 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
535 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
537 /* Load parameters for j particles */
538 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
539 charge+jnrC+0,charge+jnrD+0);
540 vdwjidx0A = 2*vdwtype[jnrA+0];
541 vdwjidx0B = 2*vdwtype[jnrB+0];
542 vdwjidx0C = 2*vdwtype[jnrC+0];
543 vdwjidx0D = 2*vdwtype[jnrD+0];
545 fjx0 = _mm256_setzero_pd();
546 fjy0 = _mm256_setzero_pd();
547 fjz0 = _mm256_setzero_pd();
549 /**************************
550 * CALCULATE INTERACTIONS *
551 **************************/
/* Site 0 - j: Lennard-Jones as in the main loop, with dummy lanes additionally cleared */
553 if (gmx_mm256_any_lt(rsq00,rcutoff2))
556 /* Compute parameters for interactions between i and j atoms */
557 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
558 vdwioffsetptr0+vdwjidx0B,
559 vdwioffsetptr0+vdwjidx0C,
560 vdwioffsetptr0+vdwjidx0D,
563 /* LENNARD-JONES DISPERSION/REPULSION */
565 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
566 vvdw6 = _mm256_mul_pd(c6_00,rinvsix);
567 vvdw12 = _mm256_mul_pd(c12_00,_mm256_mul_pd(rinvsix,rinvsix));
568 vvdw = _mm256_sub_pd(_mm256_mul_pd( _mm256_sub_pd(vvdw12 , _mm256_mul_pd(c12_00,_mm256_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6))), one_twelfth) ,
569 _mm256_mul_pd( _mm256_sub_pd(vvdw6,_mm256_mul_pd(c6_00,sh_vdw_invrcut6)),one_sixth));
570 fvdw = _mm256_mul_pd(_mm256_sub_pd(vvdw12,vvdw6),rinvsq00);
572 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
574 /* Update potential sum for this i atom from the interaction with this j atom. */
575 vvdw = _mm256_and_pd(vvdw,cutoff_mask);
576 vvdw = _mm256_andnot_pd(dummy_mask,vvdw);
577 vvdwsum = _mm256_add_pd(vvdwsum,vvdw);
581 fscal = _mm256_and_pd(fscal,cutoff_mask);
583 fscal = _mm256_andnot_pd(dummy_mask,fscal);
585 /* Calculate temporary vectorial force */
586 tx = _mm256_mul_pd(fscal,dx00);
587 ty = _mm256_mul_pd(fscal,dy00);
588 tz = _mm256_mul_pd(fscal,dz00);
590 /* Update vectorial force */
591 fix0 = _mm256_add_pd(fix0,tx);
592 fiy0 = _mm256_add_pd(fiy0,ty);
593 fiz0 = _mm256_add_pd(fiz0,tz);
595 fjx0 = _mm256_add_pd(fjx0,tx);
596 fjy0 = _mm256_add_pd(fjy0,ty);
597 fjz0 = _mm256_add_pd(fjz0,tz);
601 /**************************
602 * CALCULATE INTERACTIONS *
603 **************************/
/* Site 1 - j: Ewald as in the main loop; r is zeroed in dummy lanes, so their table index is 0 */
605 if (gmx_mm256_any_lt(rsq10,rcutoff2))
608 r10 = _mm256_mul_pd(rsq10,rinv10);
609 r10 = _mm256_andnot_pd(dummy_mask,r10);
611 /* Compute parameters for interactions between i and j atoms */
612 qq10 = _mm256_mul_pd(iq1,jq0);
614 /* EWALD ELECTROSTATICS */
616 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
617 ewrt = _mm256_mul_pd(r10,ewtabscale);
618 ewitab = _mm256_cvttpd_epi32(ewrt);
619 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
620 ewitab = _mm_slli_epi32(ewitab,2);
621 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
622 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
623 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
624 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
625 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
626 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
627 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
628 velec = _mm256_mul_pd(qq10,_mm256_sub_pd(_mm256_sub_pd(rinv10,sh_ewald),velec));
629 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
631 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
633 /* Update potential sum for this i atom from the interaction with this j atom. */
634 velec = _mm256_and_pd(velec,cutoff_mask);
635 velec = _mm256_andnot_pd(dummy_mask,velec);
636 velecsum = _mm256_add_pd(velecsum,velec);
640 fscal = _mm256_and_pd(fscal,cutoff_mask);
642 fscal = _mm256_andnot_pd(dummy_mask,fscal);
644 /* Calculate temporary vectorial force */
645 tx = _mm256_mul_pd(fscal,dx10);
646 ty = _mm256_mul_pd(fscal,dy10);
647 tz = _mm256_mul_pd(fscal,dz10);
649 /* Update vectorial force */
650 fix1 = _mm256_add_pd(fix1,tx);
651 fiy1 = _mm256_add_pd(fiy1,ty);
652 fiz1 = _mm256_add_pd(fiz1,tz);
654 fjx0 = _mm256_add_pd(fjx0,tx);
655 fjy0 = _mm256_add_pd(fjy0,ty);
656 fjz0 = _mm256_add_pd(fjz0,tz);
660 /**************************
661 * CALCULATE INTERACTIONS *
662 **************************/
/* Site 2 - j: same masked Ewald evaluation as for site 1 */
664 if (gmx_mm256_any_lt(rsq20,rcutoff2))
667 r20 = _mm256_mul_pd(rsq20,rinv20);
668 r20 = _mm256_andnot_pd(dummy_mask,r20);
670 /* Compute parameters for interactions between i and j atoms */
671 qq20 = _mm256_mul_pd(iq2,jq0);
673 /* EWALD ELECTROSTATICS */
675 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
676 ewrt = _mm256_mul_pd(r20,ewtabscale);
677 ewitab = _mm256_cvttpd_epi32(ewrt);
678 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
679 ewitab = _mm_slli_epi32(ewitab,2);
680 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
681 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
682 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
683 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
684 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
685 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
686 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
687 velec = _mm256_mul_pd(qq20,_mm256_sub_pd(_mm256_sub_pd(rinv20,sh_ewald),velec));
688 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
690 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
692 /* Update potential sum for this i atom from the interaction with this j atom. */
693 velec = _mm256_and_pd(velec,cutoff_mask);
694 velec = _mm256_andnot_pd(dummy_mask,velec);
695 velecsum = _mm256_add_pd(velecsum,velec);
699 fscal = _mm256_and_pd(fscal,cutoff_mask);
701 fscal = _mm256_andnot_pd(dummy_mask,fscal);
703 /* Calculate temporary vectorial force */
704 tx = _mm256_mul_pd(fscal,dx20);
705 ty = _mm256_mul_pd(fscal,dy20);
706 tz = _mm256_mul_pd(fscal,dz20);
708 /* Update vectorial force */
709 fix2 = _mm256_add_pd(fix2,tx);
710 fiy2 = _mm256_add_pd(fiy2,ty);
711 fiz2 = _mm256_add_pd(fiz2,tz);
713 fjx0 = _mm256_add_pd(fjx0,tx);
714 fjy0 = _mm256_add_pd(fjy0,ty);
715 fjz0 = _mm256_add_pd(fjz0,tz);
719 /**************************
720 * CALCULATE INTERACTIONS *
721 **************************/
/* Site 3 - j: same masked Ewald evaluation as for site 1 */
723 if (gmx_mm256_any_lt(rsq30,rcutoff2))
726 r30 = _mm256_mul_pd(rsq30,rinv30);
727 r30 = _mm256_andnot_pd(dummy_mask,r30);
729 /* Compute parameters for interactions between i and j atoms */
730 qq30 = _mm256_mul_pd(iq3,jq0);
732 /* EWALD ELECTROSTATICS */
734 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
735 ewrt = _mm256_mul_pd(r30,ewtabscale);
736 ewitab = _mm256_cvttpd_epi32(ewrt);
737 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
738 ewitab = _mm_slli_epi32(ewitab,2);
739 ewtabF = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
740 ewtabD = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
741 ewtabV = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,2) );
742 ewtabFn = _mm256_load_pd( ewtab + _mm_extract_epi32(ewitab,3) );
743 GMX_MM256_FULLTRANSPOSE4_PD(ewtabF,ewtabD,ewtabV,ewtabFn);
744 felec = _mm256_add_pd(ewtabF,_mm256_mul_pd(eweps,ewtabD));
745 velec = _mm256_sub_pd(ewtabV,_mm256_mul_pd(_mm256_mul_pd(ewtabhalfspace,eweps),_mm256_add_pd(ewtabF,felec)));
746 velec = _mm256_mul_pd(qq30,_mm256_sub_pd(_mm256_sub_pd(rinv30,sh_ewald),velec));
747 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
749 cutoff_mask = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
751 /* Update potential sum for this i atom from the interaction with this j atom. */
752 velec = _mm256_and_pd(velec,cutoff_mask);
753 velec = _mm256_andnot_pd(dummy_mask,velec);
754 velecsum = _mm256_add_pd(velecsum,velec);
758 fscal = _mm256_and_pd(fscal,cutoff_mask);
760 fscal = _mm256_andnot_pd(dummy_mask,fscal);
762 /* Calculate temporary vectorial force */
763 tx = _mm256_mul_pd(fscal,dx30);
764 ty = _mm256_mul_pd(fscal,dy30);
765 tz = _mm256_mul_pd(fscal,dz30);
767 /* Update vectorial force */
768 fix3 = _mm256_add_pd(fix3,tx);
769 fiy3 = _mm256_add_pd(fiy3,ty);
770 fiz3 = _mm256_add_pd(fiz3,tz);
772 fjx0 = _mm256_add_pd(fjx0,tx);
773 fjy0 = _mm256_add_pd(fjy0,ty);
774 fjz0 = _mm256_add_pd(fjz0,tz);
/* Entries with a negative list index get scratch as destination, so f is only touched for non-negative entries */
778 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
779 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
780 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
781 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
783 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
785 /* Inner loop uses 185 flops */
788 /* End of innermost loop */
/* Hand the twelve i-site force accumulators to the helper together with the f and fshift destinations for this i entry */
790 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
791 f+i_coord_offset,fshift+i_shift_offset);
794 /* Update potential energies */
795 gmx_mm256_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
796 gmx_mm256_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
798 /* Increment number of inner iterations */
/* Counts every list entry in [j_index_start, j_index_end) */
799 inneriter += j_index_end - j_index_start;
801 /* Outer loop uses 26 flops */
804 /* Increment number of outer iterations */
807 /* Update outer/inner flops */
/* Uses 26 per outer iteration and 185 per inner iteration; 185 is the figure quoted for the masked section, the main loop quotes 182 */
809 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*185);
812 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_double
813 * Electrostatics interaction: Ewald
814 * VdW interaction: LennardJones
815 * Geometry: Water4-Particle
816 * Calculate force/pot: Force
819 nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_256_double
820 (t_nblist * gmx_restrict nlist,
821 rvec * gmx_restrict xx,
822 rvec * gmx_restrict ff,
823 t_forcerec * gmx_restrict fr,
824 t_mdatoms * gmx_restrict mdatoms,
825 nb_kernel_data_t * gmx_restrict kernel_data,
826 t_nrnb * gmx_restrict nrnb)
828 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
829 * just 0 for non-waters.
830 * Suffixes A,B,C,D refer to j loop unrolling done with AVX, e.g. for the four different
831 * jnr indices corresponding to data put in the four positions in the SIMD register.
833 int i_shift_offset,i_coord_offset,outeriter,inneriter;
834 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
835 int jnrA,jnrB,jnrC,jnrD;
836 int jnrlistA,jnrlistB,jnrlistC,jnrlistD;
837 int jnrlistE,jnrlistF,jnrlistG,jnrlistH;
838 int j_coord_offsetA,j_coord_offsetB,j_coord_offsetC,j_coord_offsetD;
839 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
841 real *shiftvec,*fshift,*x,*f;
842 real *fjptrA,*fjptrB,*fjptrC,*fjptrD;
844 __m256d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
845 real * vdwioffsetptr0;
846 __m256d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
847 real * vdwioffsetptr1;
848 __m256d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
849 real * vdwioffsetptr2;
850 __m256d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
851 real * vdwioffsetptr3;
852 __m256d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
853 int vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D;
854 __m256d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
855 __m256d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
856 __m256d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
857 __m256d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
858 __m256d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
859 __m256d velec,felec,velecsum,facel,crf,krf,krf2;
862 __m256d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
865 __m256d one_sixth = _mm256_set1_pd(1.0/6.0);
866 __m256d one_twelfth = _mm256_set1_pd(1.0/12.0);
868 __m256d ewtabscale,eweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
869 __m256d beta,beta2,beta3,zeta2,pmecorrF,pmecorrV,rinv3;
871 __m256d dummy_mask,cutoff_mask;
872 __m128 tmpmask0,tmpmask1;
873 __m256d signbit = _mm256_castsi256_pd( _mm256_set1_epi32(0x80000000) );
874 __m256d one = _mm256_set1_pd(1.0);
875 __m256d two = _mm256_set1_pd(2.0);
881 jindex = nlist->jindex;
883 shiftidx = nlist->shift;
885 shiftvec = fr->shift_vec[0];
886 fshift = fr->fshift[0];
887 facel = _mm256_set1_pd(fr->epsfac);
888 charge = mdatoms->chargeA;
889 nvdwtype = fr->ntype;
891 vdwtype = mdatoms->typeA;
893 sh_ewald = _mm256_set1_pd(fr->ic->sh_ewald);
894 beta = _mm256_set1_pd(fr->ic->ewaldcoeff);
895 beta2 = _mm256_mul_pd(beta,beta);
896 beta3 = _mm256_mul_pd(beta,beta2);
898 ewtab = fr->ic->tabq_coul_F;
899 ewtabscale = _mm256_set1_pd(fr->ic->tabq_scale);
900 ewtabhalfspace = _mm256_set1_pd(0.5/fr->ic->tabq_scale);
902 /* Setup water-specific parameters */
903 inr = nlist->iinr[0];
904 iq1 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+1]));
905 iq2 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+2]));
906 iq3 = _mm256_mul_pd(facel,_mm256_set1_pd(charge[inr+3]));
907 vdwioffsetptr0 = vdwparam+2*nvdwtype*vdwtype[inr+0];
909 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
910 rcutoff_scalar = fr->rcoulomb;
911 rcutoff = _mm256_set1_pd(rcutoff_scalar);
912 rcutoff2 = _mm256_mul_pd(rcutoff,rcutoff);
914 sh_vdw_invrcut6 = _mm256_set1_pd(fr->ic->sh_invrc6);
915 rvdw = _mm256_set1_pd(fr->rvdw);
917 /* Avoid stupid compiler warnings */
918 jnrA = jnrB = jnrC = jnrD = 0;
927 for(iidx=0;iidx<4*DIM;iidx++)
932 /* Start outer loop over neighborlists */
933 for(iidx=0; iidx<nri; iidx++)
935 /* Load shift vector for this list */
936 i_shift_offset = DIM*shiftidx[iidx];
938 /* Load limits for loop over neighbors */
939 j_index_start = jindex[iidx];
940 j_index_end = jindex[iidx+1];
942 /* Get outer coordinate index */
944 i_coord_offset = DIM*inr;
946 /* Load i particle coords and add shift vector */
947 gmx_mm256_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
948 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
950 fix0 = _mm256_setzero_pd();
951 fiy0 = _mm256_setzero_pd();
952 fiz0 = _mm256_setzero_pd();
953 fix1 = _mm256_setzero_pd();
954 fiy1 = _mm256_setzero_pd();
955 fiz1 = _mm256_setzero_pd();
956 fix2 = _mm256_setzero_pd();
957 fiy2 = _mm256_setzero_pd();
958 fiz2 = _mm256_setzero_pd();
959 fix3 = _mm256_setzero_pd();
960 fiy3 = _mm256_setzero_pd();
961 fiz3 = _mm256_setzero_pd();
963 /* Start inner kernel loop */
964 for(jidx=j_index_start; jidx<j_index_end && jjnr[jidx+3]>=0; jidx+=4)
967 /* Get j neighbor index, and coordinate index */
972 j_coord_offsetA = DIM*jnrA;
973 j_coord_offsetB = DIM*jnrB;
974 j_coord_offsetC = DIM*jnrC;
975 j_coord_offsetD = DIM*jnrD;
977 /* load j atom coordinates */
978 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
979 x+j_coord_offsetC,x+j_coord_offsetD,
982 /* Calculate displacement vector */
983 dx00 = _mm256_sub_pd(ix0,jx0);
984 dy00 = _mm256_sub_pd(iy0,jy0);
985 dz00 = _mm256_sub_pd(iz0,jz0);
986 dx10 = _mm256_sub_pd(ix1,jx0);
987 dy10 = _mm256_sub_pd(iy1,jy0);
988 dz10 = _mm256_sub_pd(iz1,jz0);
989 dx20 = _mm256_sub_pd(ix2,jx0);
990 dy20 = _mm256_sub_pd(iy2,jy0);
991 dz20 = _mm256_sub_pd(iz2,jz0);
992 dx30 = _mm256_sub_pd(ix3,jx0);
993 dy30 = _mm256_sub_pd(iy3,jy0);
994 dz30 = _mm256_sub_pd(iz3,jz0);
996 /* Calculate squared distance and things based on it */
997 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
998 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
999 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1000 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
1002 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1003 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1004 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
1006 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1007 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1008 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1009 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
1011 /* Load parameters for j particles */
1012 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1013 charge+jnrC+0,charge+jnrD+0);
1014 vdwjidx0A = 2*vdwtype[jnrA+0];
1015 vdwjidx0B = 2*vdwtype[jnrB+0];
1016 vdwjidx0C = 2*vdwtype[jnrC+0];
1017 vdwjidx0D = 2*vdwtype[jnrD+0];
1019 fjx0 = _mm256_setzero_pd();
1020 fjy0 = _mm256_setzero_pd();
1021 fjz0 = _mm256_setzero_pd();
1023 /**************************
1024 * CALCULATE INTERACTIONS *
1025 **************************/
1027 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1030 /* Compute parameters for interactions between i and j atoms */
1031 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1032 vdwioffsetptr0+vdwjidx0B,
1033 vdwioffsetptr0+vdwjidx0C,
1034 vdwioffsetptr0+vdwjidx0D,
1037 /* LENNARD-JONES DISPERSION/REPULSION */
1039 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1040 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1042 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1046 fscal = _mm256_and_pd(fscal,cutoff_mask);
1048 /* Calculate temporary vectorial force */
1049 tx = _mm256_mul_pd(fscal,dx00);
1050 ty = _mm256_mul_pd(fscal,dy00);
1051 tz = _mm256_mul_pd(fscal,dz00);
1053 /* Update vectorial force */
1054 fix0 = _mm256_add_pd(fix0,tx);
1055 fiy0 = _mm256_add_pd(fiy0,ty);
1056 fiz0 = _mm256_add_pd(fiz0,tz);
1058 fjx0 = _mm256_add_pd(fjx0,tx);
1059 fjy0 = _mm256_add_pd(fjy0,ty);
1060 fjz0 = _mm256_add_pd(fjz0,tz);
1064 /**************************
1065 * CALCULATE INTERACTIONS *
1066 **************************/
1068 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1071 r10 = _mm256_mul_pd(rsq10,rinv10);
1073 /* Compute parameters for interactions between i and j atoms */
1074 qq10 = _mm256_mul_pd(iq1,jq0);
1076 /* EWALD ELECTROSTATICS */
1078 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1079 ewrt = _mm256_mul_pd(r10,ewtabscale);
1080 ewitab = _mm256_cvttpd_epi32(ewrt);
1081 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1082 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1083 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1085 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1086 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1088 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1092 fscal = _mm256_and_pd(fscal,cutoff_mask);
1094 /* Calculate temporary vectorial force */
1095 tx = _mm256_mul_pd(fscal,dx10);
1096 ty = _mm256_mul_pd(fscal,dy10);
1097 tz = _mm256_mul_pd(fscal,dz10);
1099 /* Update vectorial force */
1100 fix1 = _mm256_add_pd(fix1,tx);
1101 fiy1 = _mm256_add_pd(fiy1,ty);
1102 fiz1 = _mm256_add_pd(fiz1,tz);
1104 fjx0 = _mm256_add_pd(fjx0,tx);
1105 fjy0 = _mm256_add_pd(fjy0,ty);
1106 fjz0 = _mm256_add_pd(fjz0,tz);
1110 /**************************
1111 * CALCULATE INTERACTIONS *
1112 **************************/
1114 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1117 r20 = _mm256_mul_pd(rsq20,rinv20);
1119 /* Compute parameters for interactions between i and j atoms */
1120 qq20 = _mm256_mul_pd(iq2,jq0);
1122 /* EWALD ELECTROSTATICS */
1124 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1125 ewrt = _mm256_mul_pd(r20,ewtabscale);
1126 ewitab = _mm256_cvttpd_epi32(ewrt);
1127 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1128 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1129 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1131 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1132 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1134 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1138 fscal = _mm256_and_pd(fscal,cutoff_mask);
1140 /* Calculate temporary vectorial force */
1141 tx = _mm256_mul_pd(fscal,dx20);
1142 ty = _mm256_mul_pd(fscal,dy20);
1143 tz = _mm256_mul_pd(fscal,dz20);
1145 /* Update vectorial force */
1146 fix2 = _mm256_add_pd(fix2,tx);
1147 fiy2 = _mm256_add_pd(fiy2,ty);
1148 fiz2 = _mm256_add_pd(fiz2,tz);
1150 fjx0 = _mm256_add_pd(fjx0,tx);
1151 fjy0 = _mm256_add_pd(fjy0,ty);
1152 fjz0 = _mm256_add_pd(fjz0,tz);
1156 /**************************
1157 * CALCULATE INTERACTIONS *
1158 **************************/
1160 if (gmx_mm256_any_lt(rsq30,rcutoff2))
1163 r30 = _mm256_mul_pd(rsq30,rinv30);
1165 /* Compute parameters for interactions between i and j atoms */
1166 qq30 = _mm256_mul_pd(iq3,jq0);
1168 /* EWALD ELECTROSTATICS */
1170 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1171 ewrt = _mm256_mul_pd(r30,ewtabscale);
1172 ewitab = _mm256_cvttpd_epi32(ewrt);
1173 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1174 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1175 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1177 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1178 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
1180 cutoff_mask = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
1184 fscal = _mm256_and_pd(fscal,cutoff_mask);
1186 /* Calculate temporary vectorial force */
1187 tx = _mm256_mul_pd(fscal,dx30);
1188 ty = _mm256_mul_pd(fscal,dy30);
1189 tz = _mm256_mul_pd(fscal,dz30);
1191 /* Update vectorial force */
1192 fix3 = _mm256_add_pd(fix3,tx);
1193 fiy3 = _mm256_add_pd(fiy3,ty);
1194 fiz3 = _mm256_add_pd(fiz3,tz);
1196 fjx0 = _mm256_add_pd(fjx0,tx);
1197 fjy0 = _mm256_add_pd(fjy0,ty);
1198 fjz0 = _mm256_add_pd(fjz0,tz);
1202 fjptrA = f+j_coord_offsetA;
1203 fjptrB = f+j_coord_offsetB;
1204 fjptrC = f+j_coord_offsetC;
1205 fjptrD = f+j_coord_offsetD;
1207 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1209 /* Inner loop uses 150 flops */
1212 if(jidx<j_index_end)
1215 /* Get j neighbor index, and coordinate index */
1216 jnrlistA = jjnr[jidx];
1217 jnrlistB = jjnr[jidx+1];
1218 jnrlistC = jjnr[jidx+2];
1219 jnrlistD = jjnr[jidx+3];
1220 /* Sign of each element will be negative for non-real atoms.
1221 * This mask will be 0xFFFFFFFF for dummy entries and 0x0 for real ones,
1222 * so use it as val = _mm256_andnot_pd(mask,val) to clear dummy entries.
1224 tmpmask0 = gmx_mm_castsi128_ps(_mm_cmplt_epi32(_mm_loadu_si128((const __m128i *)(jjnr+jidx)),_mm_setzero_si128()));
1226 tmpmask1 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(3,3,2,2));
1227 tmpmask0 = _mm_permute_ps(tmpmask0,_GMX_MM_PERMUTE(1,1,0,0));
1228 dummy_mask = _mm256_castps_pd(gmx_mm256_set_m128(tmpmask1,tmpmask0));
1230 jnrA = (jnrlistA>=0) ? jnrlistA : 0;
1231 jnrB = (jnrlistB>=0) ? jnrlistB : 0;
1232 jnrC = (jnrlistC>=0) ? jnrlistC : 0;
1233 jnrD = (jnrlistD>=0) ? jnrlistD : 0;
1234 j_coord_offsetA = DIM*jnrA;
1235 j_coord_offsetB = DIM*jnrB;
1236 j_coord_offsetC = DIM*jnrC;
1237 j_coord_offsetD = DIM*jnrD;
1239 /* load j atom coordinates */
1240 gmx_mm256_load_1rvec_4ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
1241 x+j_coord_offsetC,x+j_coord_offsetD,
1244 /* Calculate displacement vector */
1245 dx00 = _mm256_sub_pd(ix0,jx0);
1246 dy00 = _mm256_sub_pd(iy0,jy0);
1247 dz00 = _mm256_sub_pd(iz0,jz0);
1248 dx10 = _mm256_sub_pd(ix1,jx0);
1249 dy10 = _mm256_sub_pd(iy1,jy0);
1250 dz10 = _mm256_sub_pd(iz1,jz0);
1251 dx20 = _mm256_sub_pd(ix2,jx0);
1252 dy20 = _mm256_sub_pd(iy2,jy0);
1253 dz20 = _mm256_sub_pd(iz2,jz0);
1254 dx30 = _mm256_sub_pd(ix3,jx0);
1255 dy30 = _mm256_sub_pd(iy3,jy0);
1256 dz30 = _mm256_sub_pd(iz3,jz0);
1258 /* Calculate squared distance and things based on it */
1259 rsq00 = gmx_mm256_calc_rsq_pd(dx00,dy00,dz00);
1260 rsq10 = gmx_mm256_calc_rsq_pd(dx10,dy10,dz10);
1261 rsq20 = gmx_mm256_calc_rsq_pd(dx20,dy20,dz20);
1262 rsq30 = gmx_mm256_calc_rsq_pd(dx30,dy30,dz30);
1264 rinv10 = gmx_mm256_invsqrt_pd(rsq10);
1265 rinv20 = gmx_mm256_invsqrt_pd(rsq20);
1266 rinv30 = gmx_mm256_invsqrt_pd(rsq30);
1268 rinvsq00 = gmx_mm256_inv_pd(rsq00);
1269 rinvsq10 = _mm256_mul_pd(rinv10,rinv10);
1270 rinvsq20 = _mm256_mul_pd(rinv20,rinv20);
1271 rinvsq30 = _mm256_mul_pd(rinv30,rinv30);
1273 /* Load parameters for j particles */
1274 jq0 = gmx_mm256_load_4real_swizzle_pd(charge+jnrA+0,charge+jnrB+0,
1275 charge+jnrC+0,charge+jnrD+0);
1276 vdwjidx0A = 2*vdwtype[jnrA+0];
1277 vdwjidx0B = 2*vdwtype[jnrB+0];
1278 vdwjidx0C = 2*vdwtype[jnrC+0];
1279 vdwjidx0D = 2*vdwtype[jnrD+0];
1281 fjx0 = _mm256_setzero_pd();
1282 fjy0 = _mm256_setzero_pd();
1283 fjz0 = _mm256_setzero_pd();
1285 /**************************
1286 * CALCULATE INTERACTIONS *
1287 **************************/
1289 if (gmx_mm256_any_lt(rsq00,rcutoff2))
1292 /* Compute parameters for interactions between i and j atoms */
1293 gmx_mm256_load_4pair_swizzle_pd(vdwioffsetptr0+vdwjidx0A,
1294 vdwioffsetptr0+vdwjidx0B,
1295 vdwioffsetptr0+vdwjidx0C,
1296 vdwioffsetptr0+vdwjidx0D,
1299 /* LENNARD-JONES DISPERSION/REPULSION */
1301 rinvsix = _mm256_mul_pd(_mm256_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1302 fvdw = _mm256_mul_pd(_mm256_sub_pd(_mm256_mul_pd(c12_00,rinvsix),c6_00),_mm256_mul_pd(rinvsix,rinvsq00));
1304 cutoff_mask = _mm256_cmp_pd(rsq00,rcutoff2,_CMP_LT_OQ);
1308 fscal = _mm256_and_pd(fscal,cutoff_mask);
1310 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1312 /* Calculate temporary vectorial force */
1313 tx = _mm256_mul_pd(fscal,dx00);
1314 ty = _mm256_mul_pd(fscal,dy00);
1315 tz = _mm256_mul_pd(fscal,dz00);
1317 /* Update vectorial force */
1318 fix0 = _mm256_add_pd(fix0,tx);
1319 fiy0 = _mm256_add_pd(fiy0,ty);
1320 fiz0 = _mm256_add_pd(fiz0,tz);
1322 fjx0 = _mm256_add_pd(fjx0,tx);
1323 fjy0 = _mm256_add_pd(fjy0,ty);
1324 fjz0 = _mm256_add_pd(fjz0,tz);
1328 /**************************
1329 * CALCULATE INTERACTIONS *
1330 **************************/
1332 if (gmx_mm256_any_lt(rsq10,rcutoff2))
1335 r10 = _mm256_mul_pd(rsq10,rinv10);
1336 r10 = _mm256_andnot_pd(dummy_mask,r10);
1338 /* Compute parameters for interactions between i and j atoms */
1339 qq10 = _mm256_mul_pd(iq1,jq0);
1341 /* EWALD ELECTROSTATICS */
1343 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1344 ewrt = _mm256_mul_pd(r10,ewtabscale);
1345 ewitab = _mm256_cvttpd_epi32(ewrt);
1346 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1347 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1348 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1350 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1351 felec = _mm256_mul_pd(_mm256_mul_pd(qq10,rinv10),_mm256_sub_pd(rinvsq10,felec));
1353 cutoff_mask = _mm256_cmp_pd(rsq10,rcutoff2,_CMP_LT_OQ);
1357 fscal = _mm256_and_pd(fscal,cutoff_mask);
1359 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1361 /* Calculate temporary vectorial force */
1362 tx = _mm256_mul_pd(fscal,dx10);
1363 ty = _mm256_mul_pd(fscal,dy10);
1364 tz = _mm256_mul_pd(fscal,dz10);
1366 /* Update vectorial force */
1367 fix1 = _mm256_add_pd(fix1,tx);
1368 fiy1 = _mm256_add_pd(fiy1,ty);
1369 fiz1 = _mm256_add_pd(fiz1,tz);
1371 fjx0 = _mm256_add_pd(fjx0,tx);
1372 fjy0 = _mm256_add_pd(fjy0,ty);
1373 fjz0 = _mm256_add_pd(fjz0,tz);
1377 /**************************
1378 * CALCULATE INTERACTIONS *
1379 **************************/
1381 if (gmx_mm256_any_lt(rsq20,rcutoff2))
1384 r20 = _mm256_mul_pd(rsq20,rinv20);
1385 r20 = _mm256_andnot_pd(dummy_mask,r20);
1387 /* Compute parameters for interactions between i and j atoms */
1388 qq20 = _mm256_mul_pd(iq2,jq0);
1390 /* EWALD ELECTROSTATICS */
1392 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1393 ewrt = _mm256_mul_pd(r20,ewtabscale);
1394 ewitab = _mm256_cvttpd_epi32(ewrt);
1395 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1396 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1397 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1399 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1400 felec = _mm256_mul_pd(_mm256_mul_pd(qq20,rinv20),_mm256_sub_pd(rinvsq20,felec));
1402 cutoff_mask = _mm256_cmp_pd(rsq20,rcutoff2,_CMP_LT_OQ);
1406 fscal = _mm256_and_pd(fscal,cutoff_mask);
1408 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1410 /* Calculate temporary vectorial force */
1411 tx = _mm256_mul_pd(fscal,dx20);
1412 ty = _mm256_mul_pd(fscal,dy20);
1413 tz = _mm256_mul_pd(fscal,dz20);
1415 /* Update vectorial force */
1416 fix2 = _mm256_add_pd(fix2,tx);
1417 fiy2 = _mm256_add_pd(fiy2,ty);
1418 fiz2 = _mm256_add_pd(fiz2,tz);
1420 fjx0 = _mm256_add_pd(fjx0,tx);
1421 fjy0 = _mm256_add_pd(fjy0,ty);
1422 fjz0 = _mm256_add_pd(fjz0,tz);
1426 /**************************
1427 * CALCULATE INTERACTIONS *
1428 **************************/
1430 if (gmx_mm256_any_lt(rsq30,rcutoff2))
1433 r30 = _mm256_mul_pd(rsq30,rinv30);
1434 r30 = _mm256_andnot_pd(dummy_mask,r30);
1436 /* Compute parameters for interactions between i and j atoms */
1437 qq30 = _mm256_mul_pd(iq3,jq0);
1439 /* EWALD ELECTROSTATICS */
1441 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1442 ewrt = _mm256_mul_pd(r30,ewtabscale);
1443 ewitab = _mm256_cvttpd_epi32(ewrt);
1444 eweps = _mm256_sub_pd(ewrt,_mm256_round_pd(ewrt, _MM_FROUND_FLOOR));
1445 gmx_mm256_load_4pair_swizzle_pd(ewtab + _mm_extract_epi32(ewitab,0),ewtab + _mm_extract_epi32(ewitab,1),
1446 ewtab + _mm_extract_epi32(ewitab,2),ewtab + _mm_extract_epi32(ewitab,3),
1448 felec = _mm256_add_pd(_mm256_mul_pd( _mm256_sub_pd(one,eweps),ewtabF),_mm256_mul_pd(eweps,ewtabFn));
1449 felec = _mm256_mul_pd(_mm256_mul_pd(qq30,rinv30),_mm256_sub_pd(rinvsq30,felec));
1451 cutoff_mask = _mm256_cmp_pd(rsq30,rcutoff2,_CMP_LT_OQ);
1455 fscal = _mm256_and_pd(fscal,cutoff_mask);
1457 fscal = _mm256_andnot_pd(dummy_mask,fscal);
1459 /* Calculate temporary vectorial force */
1460 tx = _mm256_mul_pd(fscal,dx30);
1461 ty = _mm256_mul_pd(fscal,dy30);
1462 tz = _mm256_mul_pd(fscal,dz30);
1464 /* Update vectorial force */
1465 fix3 = _mm256_add_pd(fix3,tx);
1466 fiy3 = _mm256_add_pd(fiy3,ty);
1467 fiz3 = _mm256_add_pd(fiz3,tz);
1469 fjx0 = _mm256_add_pd(fjx0,tx);
1470 fjy0 = _mm256_add_pd(fjy0,ty);
1471 fjz0 = _mm256_add_pd(fjz0,tz);
1475 fjptrA = (jnrlistA>=0) ? f+j_coord_offsetA : scratch;
1476 fjptrB = (jnrlistB>=0) ? f+j_coord_offsetB : scratch;
1477 fjptrC = (jnrlistC>=0) ? f+j_coord_offsetC : scratch;
1478 fjptrD = (jnrlistD>=0) ? f+j_coord_offsetD : scratch;
1480 gmx_mm256_decrement_1rvec_4ptr_swizzle_pd(fjptrA,fjptrB,fjptrC,fjptrD,fjx0,fjy0,fjz0);
1482 /* Inner loop uses 153 flops */
1485 /* End of innermost loop */
1487 gmx_mm256_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1488 f+i_coord_offset,fshift+i_shift_offset);
1490 /* Increment number of inner iterations */
1491 inneriter += j_index_end - j_index_start;
1493 /* Outer loop uses 24 flops */
1496 /* Increment number of outer iterations */
1499 /* Update outer/inner flops */
1501 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*153);