2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_double
38 * Electrostatics interaction: Coulomb
39 * VdW interaction: CubicSplineTable
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
/*
 * nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_double
 *
 * Potential-and-force kernel for a four-site water i molecule interacting with
 * single j particles, using plain Coulomb electrostatics and a cubic-spline
 * table for Van der Waals.
 *
 * In the code visible here, i site 0 takes part only in the tabulated VdW
 * interaction (r00, c6_00, c12_00), while i sites 1, 2 and 3 take part only in
 * the Coulomb interaction (qq10, qq20, qq30).
 *
 * Parameters, as far as they are used in the visible lines:
 *   nlist       - neighbor list; jindex, shift and iinr are read from it.
 *   xx, ff      - not referenced in the visible lines; presumably the sources
 *                 of the x and f arrays used below - TODO confirm.
 *   fr          - supplies shift_vec, fshift, epsfac and ntype.
 *   mdatoms     - supplies chargeA and typeA.
 *   kernel_data - supplies table_vdw (data, scale) and the energygrp_elec /
 *                 energygrp_vdw accumulators.
 *   nrnb        - passed to inc_nrnb() for flop accounting.
 *
 * The j loop handles two j atoms (A and B) per pass, one per SIMD lane; a
 * single-j section follows it.
 *
 * NOTE(review): several statements this code depends on are not visible in
 * this view (for example the assignments of nri, x, f, jnrA, jnrB, ggid and
 * fscal). Check any change against the complete generated file.
 */
44 nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the two positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
77 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
83 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
86 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
87 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
89 __m128i ifour = _mm_set1_epi32(4);
90 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
92 __m128d dummy_mask,cutoff_mask;
93 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
94 __m128d one = _mm_set1_pd(1.0);
95 __m128d two = _mm_set1_pd(2.0);
101 jindex = nlist->jindex;
103 shiftidx = nlist->shift;
105 shiftvec = fr->shift_vec[0];
106 fshift = fr->fshift[0];
107 facel = _mm_set1_pd(fr->epsfac);
108 charge = mdatoms->chargeA;
109 nvdwtype = fr->ntype;
111 vdwtype = mdatoms->typeA;
113 vftab = kernel_data->table_vdw->data;
114 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
116 /* Setup water-specific parameters */
/* The i-site charges and VdW type are read once, from the first i entry only;
 * presumably every water in this list shares them - TODO confirm. */
117 inr = nlist->iinr[0];
/* Charges of i sites 1..3, pre-multiplied by facel (fr->epsfac) */
118 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
119 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
120 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
/* Start of the vdwparam row for i site 0; the factor 2 matches the c6/c12 pair loaded per j type below */
121 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
123 /* Avoid stupid compiler warnings */
131 /* Start outer loop over neighborlists */
132 for(iidx=0; iidx<nri; iidx++)
134 /* Load shift vector for this list */
135 i_shift_offset = DIM*shiftidx[iidx];
137 /* Load limits for loop over neighbors */
138 j_index_start = jindex[iidx];
139 j_index_end = jindex[iidx+1];
141 /* Get outer coordinate index */
/* NOTE(review): the per-iteration assignment of inr is not visible in this view */
143 i_coord_offset = DIM*inr;
145 /* Load i particle coords and add shift vector */
146 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
147 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Clear the force accumulators of the four i sites for this list entry */
149 fix0 = _mm_setzero_pd();
150 fiy0 = _mm_setzero_pd();
151 fiz0 = _mm_setzero_pd();
152 fix1 = _mm_setzero_pd();
153 fiy1 = _mm_setzero_pd();
154 fiz1 = _mm_setzero_pd();
155 fix2 = _mm_setzero_pd();
156 fiy2 = _mm_setzero_pd();
157 fiz2 = _mm_setzero_pd();
158 fix3 = _mm_setzero_pd();
159 fiy3 = _mm_setzero_pd();
160 fiz3 = _mm_setzero_pd();
162 /* Reset potential sums */
163 velecsum = _mm_setzero_pd();
164 vvdwsum = _mm_setzero_pd();
166 /* Start inner kernel loop */
/* Two j atoms per pass (jidx+=2). The bound j_index_end-1 leaves an odd trailing
 * entry unprocessed by this loop; presumably the single-j section below handles it. */
167 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
170 /* Get j neighbor index, and coordinate index */
/* NOTE(review): the assignments of jnrA and jnrB are not visible in this view */
173 j_coord_offsetA = DIM*jnrA;
174 j_coord_offsetB = DIM*jnrB;
176 /* load j atom coordinates */
177 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
180 /* Calculate displacement vector */
181 dx00 = _mm_sub_pd(ix0,jx0);
182 dy00 = _mm_sub_pd(iy0,jy0);
183 dz00 = _mm_sub_pd(iz0,jz0);
184 dx10 = _mm_sub_pd(ix1,jx0);
185 dy10 = _mm_sub_pd(iy1,jy0);
186 dz10 = _mm_sub_pd(iz1,jz0);
187 dx20 = _mm_sub_pd(ix2,jx0);
188 dy20 = _mm_sub_pd(iy2,jy0);
189 dz20 = _mm_sub_pd(iz2,jz0);
190 dx30 = _mm_sub_pd(ix3,jx0);
191 dy30 = _mm_sub_pd(iy3,jy0);
192 dz30 = _mm_sub_pd(iz3,jz0);
194 /* Calculate squared distance and things based on it */
195 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
196 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
197 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
198 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
200 rinv00 = gmx_mm_invsqrt_pd(rsq00);
201 rinv10 = gmx_mm_invsqrt_pd(rsq10);
202 rinv20 = gmx_mm_invsqrt_pd(rsq20);
203 rinv30 = gmx_mm_invsqrt_pd(rsq30);
/* 1/r^2 is formed only for pairs 10, 20 and 30, the ones used by the Coulomb force below */
205 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
206 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
207 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
209 /* Load parameters for j particles */
210 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
211 vdwjidx0A = 2*vdwtype[jnrA+0];
212 vdwjidx0B = 2*vdwtype[jnrB+0];
/* The j force is accumulated over all four i sites and applied once at the end of the pass */
214 fjx0 = _mm_setzero_pd();
215 fjy0 = _mm_setzero_pd();
216 fjz0 = _mm_setzero_pd();
218 /**************************
219 * CALCULATE INTERACTIONS *
220 **************************/
/* Pair 00: tabulated Van der Waals only.  r = r^2 * (1/r) */
222 r00 = _mm_mul_pd(rsq00,rinv00);
224 /* Compute parameters for interactions between i and j atoms */
225 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
226 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
228 /* Calculate table index by multiplying r with table scale and truncate to integer */
229 rt = _mm_mul_pd(r00,vftabscale);
230 vfitab = _mm_cvttpd_epi32(rt);
/* NOTE(review): vfeps is assigned twice in a row and the second value overwrites the first.
 * No preprocessor conditional is visible between the two lines in this view; presumably
 * they are build-time alternatives - confirm against the full file. */
232 vfeps = _mm_frcz_pd(rt);
234 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
236 twovfeps = _mm_add_pd(vfeps,vfeps);
/* Index * 8: per the loads below, each table point has 4 values (Y,F,G,H) for dispersion then 4 for repulsion */
237 vfitab = _mm_slli_epi32(vfitab,3);
239 /* CUBIC SPLINE TABLE DISPERSION */
/* Each load fetches two adjacent table values for one j lane; the transpose regroups them per quantity */
240 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
241 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
242 GMX_MM_TRANSPOSE2_PD(Y,F);
243 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
244 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
245 GMX_MM_TRANSPOSE2_PD(G,H);
/* With _mm_macc_pd(a,b,c) = a*b+c:  Fp = F + eps*(G + eps*H) */
246 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
/* VV = Y + eps*Fp */
247 VV = _mm_macc_pd(vfeps,Fp,Y);
248 vvdw6 = _mm_mul_pd(c6_00,VV);
/* FF = Fp + eps*(G + 2*eps*H) */
249 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
250 fvdw6 = _mm_mul_pd(c6_00,FF);
252 /* CUBIC SPLINE TABLE REPULSION */
/* Advance by 4 entries to the repulsion values of the same table point */
253 vfitab = _mm_add_epi32(vfitab,ifour);
254 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
255 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
256 GMX_MM_TRANSPOSE2_PD(Y,F);
257 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
258 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
259 GMX_MM_TRANSPOSE2_PD(G,H);
260 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
261 VV = _mm_macc_pd(vfeps,Fp,Y);
262 vvdw12 = _mm_mul_pd(c12_00,VV);
263 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
264 fvdw12 = _mm_mul_pd(c12_00,FF);
265 vvdw = _mm_add_pd(vvdw12,vvdw6);
/* XOR with signbit negates both lanes: fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00 */
266 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
268 /* Update potential sum for this i atom from the interaction with this j atom. */
269 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
/* NOTE(review): no assignment to fscal is visible in this view before it is used below;
 * presumably fscal = fvdw here - confirm against the full file. */
273 /* Update vectorial force */
274 fix0 = _mm_macc_pd(dx00,fscal,fix0);
275 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
276 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
/* The same dx*fscal product is accumulated for the j atom */
278 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
279 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
280 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
282 /**************************
283 * CALCULATE INTERACTIONS *
284 **************************/
/* Pair 10: Coulomb only */
286 /* Compute parameters for interactions between i and j atoms */
287 qq10 = _mm_mul_pd(iq1,jq0);
289 /* COULOMB ELECTROSTATICS */
/* velec = qq/r, felec = velec/r^2 */
290 velec = _mm_mul_pd(qq10,rinv10);
291 felec = _mm_mul_pd(velec,rinvsq10);
293 /* Update potential sum for this i atom from the interaction with this j atom. */
294 velecsum = _mm_add_pd(velecsum,velec);
/* NOTE(review): presumably fscal = felec at this point (assignment not visible) - confirm */
298 /* Update vectorial force */
299 fix1 = _mm_macc_pd(dx10,fscal,fix1);
300 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
301 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
303 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
304 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
305 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
307 /**************************
308 * CALCULATE INTERACTIONS *
309 **************************/
/* Pair 20: Coulomb only */
311 /* Compute parameters for interactions between i and j atoms */
312 qq20 = _mm_mul_pd(iq2,jq0);
314 /* COULOMB ELECTROSTATICS */
315 velec = _mm_mul_pd(qq20,rinv20);
316 felec = _mm_mul_pd(velec,rinvsq20);
318 /* Update potential sum for this i atom from the interaction with this j atom. */
319 velecsum = _mm_add_pd(velecsum,velec);
323 /* Update vectorial force */
324 fix2 = _mm_macc_pd(dx20,fscal,fix2);
325 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
326 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
328 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
329 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
330 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
332 /**************************
333 * CALCULATE INTERACTIONS *
334 **************************/
/* Pair 30: Coulomb only */
336 /* Compute parameters for interactions between i and j atoms */
337 qq30 = _mm_mul_pd(iq3,jq0);
339 /* COULOMB ELECTROSTATICS */
340 velec = _mm_mul_pd(qq30,rinv30);
341 felec = _mm_mul_pd(velec,rinvsq30);
343 /* Update potential sum for this i atom from the interaction with this j atom. */
344 velecsum = _mm_add_pd(velecsum,velec);
348 /* Update vectorial force */
349 fix3 = _mm_macc_pd(dx30,fscal,fix3);
350 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
351 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
353 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
354 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
355 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
/* Apply the accumulated j force to f for both j atoms (the helper, defined elsewhere, is named "decrement") */
357 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
359 /* Inner loop uses 155 flops */
/* Single-j section: the same arithmetic as the loop above, using only j atom A and the 1ptr
 * helper variants. NOTE(review): the condition guarding this section is not visible in this view. */
366 j_coord_offsetA = DIM*jnrA;
368 /* load j atom coordinates */
369 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
372 /* Calculate displacement vector */
373 dx00 = _mm_sub_pd(ix0,jx0);
374 dy00 = _mm_sub_pd(iy0,jy0);
375 dz00 = _mm_sub_pd(iz0,jz0);
376 dx10 = _mm_sub_pd(ix1,jx0);
377 dy10 = _mm_sub_pd(iy1,jy0);
378 dz10 = _mm_sub_pd(iz1,jz0);
379 dx20 = _mm_sub_pd(ix2,jx0);
380 dy20 = _mm_sub_pd(iy2,jy0);
381 dz20 = _mm_sub_pd(iz2,jz0);
382 dx30 = _mm_sub_pd(ix3,jx0);
383 dy30 = _mm_sub_pd(iy3,jy0);
384 dz30 = _mm_sub_pd(iz3,jz0);
386 /* Calculate squared distance and things based on it */
387 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
388 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
389 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
390 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
392 rinv00 = gmx_mm_invsqrt_pd(rsq00);
393 rinv10 = gmx_mm_invsqrt_pd(rsq10);
394 rinv20 = gmx_mm_invsqrt_pd(rsq20);
395 rinv30 = gmx_mm_invsqrt_pd(rsq30);
397 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
398 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
399 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
401 /* Load parameters for j particles */
/* _mm_load_sd fills the low lane with the charge and zeroes the upper lane */
402 jq0 = _mm_load_sd(charge+jnrA+0);
403 vdwjidx0A = 2*vdwtype[jnrA+0];
405 fjx0 = _mm_setzero_pd();
406 fjy0 = _mm_setzero_pd();
407 fjz0 = _mm_setzero_pd();
409 /**************************
410 * CALCULATE INTERACTIONS *
411 **************************/
413 r00 = _mm_mul_pd(rsq00,rinv00);
415 /* Compute parameters for interactions between i and j atoms */
416 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
418 /* Calculate table index by multiplying r with table scale and truncate to integer */
419 rt = _mm_mul_pd(r00,vftabscale);
420 vfitab = _mm_cvttpd_epi32(rt);
/* NOTE(review): as in the loop above, the second vfeps assignment overwrites the first in this view */
422 vfeps = _mm_frcz_pd(rt);
424 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
426 twovfeps = _mm_add_pd(vfeps,vfeps);
427 vfitab = _mm_slli_epi32(vfitab,3);
429 /* CUBIC SPLINE TABLE DISPERSION */
/* Only table index 0 is loaded; zero stands in for the absent second j atom before the transpose */
430 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
431 F = _mm_setzero_pd();
432 GMX_MM_TRANSPOSE2_PD(Y,F);
433 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
434 H = _mm_setzero_pd();
435 GMX_MM_TRANSPOSE2_PD(G,H);
436 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
437 VV = _mm_macc_pd(vfeps,Fp,Y);
438 vvdw6 = _mm_mul_pd(c6_00,VV);
439 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
440 fvdw6 = _mm_mul_pd(c6_00,FF);
442 /* CUBIC SPLINE TABLE REPULSION */
443 vfitab = _mm_add_epi32(vfitab,ifour);
444 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
445 F = _mm_setzero_pd();
446 GMX_MM_TRANSPOSE2_PD(Y,F);
447 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
448 H = _mm_setzero_pd();
449 GMX_MM_TRANSPOSE2_PD(G,H);
450 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
451 VV = _mm_macc_pd(vfeps,Fp,Y);
452 vvdw12 = _mm_mul_pd(c12_00,VV);
453 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
454 fvdw12 = _mm_mul_pd(c12_00,FF);
455 vvdw = _mm_add_pd(vvdw12,vvdw6);
456 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
458 /* Update potential sum for this i atom from the interaction with this j atom. */
/* unpacklo with zero keeps the low lane and zeroes the upper one, so the unused lane adds nothing */
459 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
460 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
/* The same upper-lane clearing is applied to the force scalar before every force update below */
464 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
466 /* Update vectorial force */
467 fix0 = _mm_macc_pd(dx00,fscal,fix0);
468 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
469 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
471 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
472 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
473 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
475 /**************************
476 * CALCULATE INTERACTIONS *
477 **************************/
479 /* Compute parameters for interactions between i and j atoms */
480 qq10 = _mm_mul_pd(iq1,jq0);
482 /* COULOMB ELECTROSTATICS */
483 velec = _mm_mul_pd(qq10,rinv10);
484 felec = _mm_mul_pd(velec,rinvsq10);
486 /* Update potential sum for this i atom from the interaction with this j atom. */
487 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
488 velecsum = _mm_add_pd(velecsum,velec);
492 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
494 /* Update vectorial force */
495 fix1 = _mm_macc_pd(dx10,fscal,fix1);
496 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
497 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
499 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
500 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
501 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
503 /**************************
504 * CALCULATE INTERACTIONS *
505 **************************/
507 /* Compute parameters for interactions between i and j atoms */
508 qq20 = _mm_mul_pd(iq2,jq0);
510 /* COULOMB ELECTROSTATICS */
511 velec = _mm_mul_pd(qq20,rinv20);
512 felec = _mm_mul_pd(velec,rinvsq20);
514 /* Update potential sum for this i atom from the interaction with this j atom. */
515 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
516 velecsum = _mm_add_pd(velecsum,velec);
520 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
522 /* Update vectorial force */
523 fix2 = _mm_macc_pd(dx20,fscal,fix2);
524 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
525 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
527 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
528 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
529 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
531 /**************************
532 * CALCULATE INTERACTIONS *
533 **************************/
535 /* Compute parameters for interactions between i and j atoms */
536 qq30 = _mm_mul_pd(iq3,jq0);
538 /* COULOMB ELECTROSTATICS */
539 velec = _mm_mul_pd(qq30,rinv30);
540 felec = _mm_mul_pd(velec,rinvsq30);
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
544 velecsum = _mm_add_pd(velecsum,velec);
548 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
550 /* Update vectorial force */
551 fix3 = _mm_macc_pd(dx30,fscal,fix3);
552 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
553 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
555 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
556 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
557 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
559 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
561 /* Inner loop uses 155 flops */
564 /* End of innermost loop */
/* Hand the twelve i-site force accumulators to the helper together with the f and fshift
 * destinations for this i molecule (helper defined elsewhere) */
566 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
567 f+i_coord_offset,fshift+i_shift_offset);
570 /* Update potential energies */
/* NOTE(review): the assignment of ggid is not visible in this view */
571 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
572 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
574 /* Increment number of inner iterations */
575 inneriter += j_index_end - j_index_start;
577 /* Outer loop uses 26 flops */
580 /* Increment number of outer iterations */
583 /* Update outer/inner flops */
/* Flop estimate: 26 per outer iteration plus 155 per j entry, matching the counts quoted above */
585 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*155);
588 * Gromacs nonbonded kernel: nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_double
589 * Electrostatics interaction: Coulomb
590 * VdW interaction: CubicSplineTable
591 * Geometry: Water4-Particle
592 * Calculate force/pot: Force
/*
 * nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_double
 *
 * Force-only kernel for a four-site water i molecule interacting with single
 * j particles, using plain Coulomb electrostatics and a cubic-spline table for
 * Van der Waals. No potential sums are accumulated and no energy-group
 * accumulators are updated in the visible code.
 *
 * In the code visible here, i site 0 takes part only in the tabulated VdW
 * interaction (r00, c6_00, c12_00), while i sites 1, 2 and 3 take part only in
 * the Coulomb interaction (qq10, qq20, qq30).
 *
 * Parameters, as far as they are used in the visible lines:
 *   nlist       - neighbor list; jindex, shift and iinr are read from it.
 *   xx, ff      - not referenced in the visible lines; presumably the sources
 *                 of the x and f arrays used below - TODO confirm.
 *   fr          - supplies shift_vec, fshift, epsfac and ntype.
 *   mdatoms     - supplies chargeA and typeA.
 *   kernel_data - supplies table_vdw (data, scale).
 *   nrnb        - passed to inc_nrnb() for flop accounting.
 *
 * The j loop handles two j atoms (A and B) per pass, one per SIMD lane; a
 * single-j section follows it.
 *
 * NOTE(review): several statements this code depends on are not visible in
 * this view (for example the assignments of nri, x, f, jnrA, jnrB and fscal).
 * Check any change against the complete generated file.
 */
595 nb_kernel_ElecCoul_VdwCSTab_GeomW4P1_F_avx_128_fma_double
596 (t_nblist * gmx_restrict nlist,
597 rvec * gmx_restrict xx,
598 rvec * gmx_restrict ff,
599 t_forcerec * gmx_restrict fr,
600 t_mdatoms * gmx_restrict mdatoms,
601 nb_kernel_data_t * gmx_restrict kernel_data,
602 t_nrnb * gmx_restrict nrnb)
604 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
605 * just 0 for non-waters.
606 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
607 * jnr indices corresponding to data put in the two positions in the SIMD register.
609 int i_shift_offset,i_coord_offset,outeriter,inneriter;
610 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
612 int j_coord_offsetA,j_coord_offsetB;
613 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
615 real *shiftvec,*fshift,*x,*f;
616 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
618 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
620 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
622 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
624 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
625 int vdwjidx0A,vdwjidx0B;
626 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
627 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
628 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
629 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
630 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
631 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
634 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
637 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
638 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
640 __m128i ifour = _mm_set1_epi32(4);
641 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
643 __m128d dummy_mask,cutoff_mask;
644 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
645 __m128d one = _mm_set1_pd(1.0);
646 __m128d two = _mm_set1_pd(2.0);
652 jindex = nlist->jindex;
654 shiftidx = nlist->shift;
656 shiftvec = fr->shift_vec[0];
657 fshift = fr->fshift[0];
658 facel = _mm_set1_pd(fr->epsfac);
659 charge = mdatoms->chargeA;
660 nvdwtype = fr->ntype;
662 vdwtype = mdatoms->typeA;
664 vftab = kernel_data->table_vdw->data;
665 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
667 /* Setup water-specific parameters */
/* The i-site charges and VdW type are read once, from the first i entry only;
 * presumably every water in this list shares them - TODO confirm. */
668 inr = nlist->iinr[0];
/* Charges of i sites 1..3, pre-multiplied by facel (fr->epsfac) */
669 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
670 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
671 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
/* Start of the vdwparam row for i site 0; the factor 2 matches the c6/c12 pair loaded per j type below */
672 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
674 /* Avoid stupid compiler warnings */
682 /* Start outer loop over neighborlists */
683 for(iidx=0; iidx<nri; iidx++)
685 /* Load shift vector for this list */
686 i_shift_offset = DIM*shiftidx[iidx];
688 /* Load limits for loop over neighbors */
689 j_index_start = jindex[iidx];
690 j_index_end = jindex[iidx+1];
692 /* Get outer coordinate index */
/* NOTE(review): the per-iteration assignment of inr is not visible in this view */
694 i_coord_offset = DIM*inr;
696 /* Load i particle coords and add shift vector */
697 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
698 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
/* Clear the force accumulators of the four i sites for this list entry */
700 fix0 = _mm_setzero_pd();
701 fiy0 = _mm_setzero_pd();
702 fiz0 = _mm_setzero_pd();
703 fix1 = _mm_setzero_pd();
704 fiy1 = _mm_setzero_pd();
705 fiz1 = _mm_setzero_pd();
706 fix2 = _mm_setzero_pd();
707 fiy2 = _mm_setzero_pd();
708 fiz2 = _mm_setzero_pd();
709 fix3 = _mm_setzero_pd();
710 fiy3 = _mm_setzero_pd();
711 fiz3 = _mm_setzero_pd();
713 /* Start inner kernel loop */
/* Two j atoms per pass (jidx+=2). The bound j_index_end-1 leaves an odd trailing
 * entry unprocessed by this loop; presumably the single-j section below handles it. */
714 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
717 /* Get j neighbor index, and coordinate index */
/* NOTE(review): the assignments of jnrA and jnrB are not visible in this view */
720 j_coord_offsetA = DIM*jnrA;
721 j_coord_offsetB = DIM*jnrB;
723 /* load j atom coordinates */
724 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
727 /* Calculate displacement vector */
728 dx00 = _mm_sub_pd(ix0,jx0);
729 dy00 = _mm_sub_pd(iy0,jy0);
730 dz00 = _mm_sub_pd(iz0,jz0);
731 dx10 = _mm_sub_pd(ix1,jx0);
732 dy10 = _mm_sub_pd(iy1,jy0);
733 dz10 = _mm_sub_pd(iz1,jz0);
734 dx20 = _mm_sub_pd(ix2,jx0);
735 dy20 = _mm_sub_pd(iy2,jy0);
736 dz20 = _mm_sub_pd(iz2,jz0);
737 dx30 = _mm_sub_pd(ix3,jx0);
738 dy30 = _mm_sub_pd(iy3,jy0);
739 dz30 = _mm_sub_pd(iz3,jz0);
741 /* Calculate squared distance and things based on it */
742 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
743 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
744 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
745 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
747 rinv00 = gmx_mm_invsqrt_pd(rsq00);
748 rinv10 = gmx_mm_invsqrt_pd(rsq10);
749 rinv20 = gmx_mm_invsqrt_pd(rsq20);
750 rinv30 = gmx_mm_invsqrt_pd(rsq30);
/* 1/r^2 is formed only for pairs 10, 20 and 30, the ones used by the Coulomb force below */
752 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
753 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
754 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
756 /* Load parameters for j particles */
757 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
758 vdwjidx0A = 2*vdwtype[jnrA+0];
759 vdwjidx0B = 2*vdwtype[jnrB+0];
/* The j force is accumulated over all four i sites and applied once at the end of the pass */
761 fjx0 = _mm_setzero_pd();
762 fjy0 = _mm_setzero_pd();
763 fjz0 = _mm_setzero_pd();
765 /**************************
766 * CALCULATE INTERACTIONS *
767 **************************/
/* Pair 00: tabulated Van der Waals only.  r = r^2 * (1/r) */
769 r00 = _mm_mul_pd(rsq00,rinv00);
771 /* Compute parameters for interactions between i and j atoms */
772 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
773 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
775 /* Calculate table index by multiplying r with table scale and truncate to integer */
776 rt = _mm_mul_pd(r00,vftabscale);
777 vfitab = _mm_cvttpd_epi32(rt);
/* NOTE(review): vfeps is assigned twice in a row and the second value overwrites the first.
 * No preprocessor conditional is visible between the two lines in this view; presumably
 * they are build-time alternatives - confirm against the full file. */
779 vfeps = _mm_frcz_pd(rt);
781 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
783 twovfeps = _mm_add_pd(vfeps,vfeps);
/* Index * 8: per the loads below, each table point has 4 values (Y,F,G,H) for dispersion then 4 for repulsion */
784 vfitab = _mm_slli_epi32(vfitab,3);
786 /* CUBIC SPLINE TABLE DISPERSION */
/* Each load fetches two adjacent table values for one j lane; the transpose regroups them per quantity.
 * Y is loaded but not used afterwards in this force-only variant. */
787 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
788 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
789 GMX_MM_TRANSPOSE2_PD(Y,F);
790 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
791 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
792 GMX_MM_TRANSPOSE2_PD(G,H);
/* With _mm_macc_pd(a,b,c) = a*b+c:  Fp = F + eps*(G + eps*H) */
793 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
/* FF = Fp + eps*(G + 2*eps*H) */
794 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
795 fvdw6 = _mm_mul_pd(c6_00,FF);
797 /* CUBIC SPLINE TABLE REPULSION */
/* Advance by 4 entries to the repulsion values of the same table point */
798 vfitab = _mm_add_epi32(vfitab,ifour);
799 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
800 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
801 GMX_MM_TRANSPOSE2_PD(Y,F);
802 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
803 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
804 GMX_MM_TRANSPOSE2_PD(G,H);
805 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
806 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
807 fvdw12 = _mm_mul_pd(c12_00,FF);
/* XOR with signbit negates both lanes: fvdw = -(fvdw6+fvdw12)*vftabscale*rinv00 */
808 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
/* NOTE(review): no assignment to fscal is visible in this view before it is used below;
 * presumably fscal = fvdw here - confirm against the full file. */
812 /* Update vectorial force */
813 fix0 = _mm_macc_pd(dx00,fscal,fix0);
814 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
815 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
/* The same dx*fscal product is accumulated for the j atom */
817 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
818 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
819 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
821 /**************************
822 * CALCULATE INTERACTIONS *
823 **************************/
/* Pair 10: Coulomb only */
825 /* Compute parameters for interactions between i and j atoms */
826 qq10 = _mm_mul_pd(iq1,jq0);
828 /* COULOMB ELECTROSTATICS */
/* velec = qq/r is kept only as an intermediate here; felec = velec/r^2 */
829 velec = _mm_mul_pd(qq10,rinv10);
830 felec = _mm_mul_pd(velec,rinvsq10);
/* NOTE(review): presumably fscal = felec at this point (assignment not visible) - confirm */
834 /* Update vectorial force */
835 fix1 = _mm_macc_pd(dx10,fscal,fix1);
836 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
837 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
839 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
840 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
841 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
843 /**************************
844 * CALCULATE INTERACTIONS *
845 **************************/
/* Pair 20: Coulomb only */
847 /* Compute parameters for interactions between i and j atoms */
848 qq20 = _mm_mul_pd(iq2,jq0);
850 /* COULOMB ELECTROSTATICS */
851 velec = _mm_mul_pd(qq20,rinv20);
852 felec = _mm_mul_pd(velec,rinvsq20);
856 /* Update vectorial force */
857 fix2 = _mm_macc_pd(dx20,fscal,fix2);
858 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
859 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
861 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
862 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
863 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
865 /**************************
866 * CALCULATE INTERACTIONS *
867 **************************/
/* Pair 30: Coulomb only */
869 /* Compute parameters for interactions between i and j atoms */
870 qq30 = _mm_mul_pd(iq3,jq0);
872 /* COULOMB ELECTROSTATICS */
873 velec = _mm_mul_pd(qq30,rinv30);
874 felec = _mm_mul_pd(velec,rinvsq30);
878 /* Update vectorial force */
879 fix3 = _mm_macc_pd(dx30,fscal,fix3);
880 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
881 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
883 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
884 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
885 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
/* Apply the accumulated j force to f for both j atoms (the helper, defined elsewhere, is named "decrement") */
887 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
889 /* Inner loop uses 144 flops */
/* Single-j section: the same arithmetic as the loop above, using only j atom A and the 1ptr
 * helper variants. NOTE(review): the condition guarding this section is not visible in this view. */
896 j_coord_offsetA = DIM*jnrA;
898 /* load j atom coordinates */
899 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
902 /* Calculate displacement vector */
903 dx00 = _mm_sub_pd(ix0,jx0);
904 dy00 = _mm_sub_pd(iy0,jy0);
905 dz00 = _mm_sub_pd(iz0,jz0);
906 dx10 = _mm_sub_pd(ix1,jx0);
907 dy10 = _mm_sub_pd(iy1,jy0);
908 dz10 = _mm_sub_pd(iz1,jz0);
909 dx20 = _mm_sub_pd(ix2,jx0);
910 dy20 = _mm_sub_pd(iy2,jy0);
911 dz20 = _mm_sub_pd(iz2,jz0);
912 dx30 = _mm_sub_pd(ix3,jx0);
913 dy30 = _mm_sub_pd(iy3,jy0);
914 dz30 = _mm_sub_pd(iz3,jz0);
916 /* Calculate squared distance and things based on it */
917 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
918 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
919 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
920 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
922 rinv00 = gmx_mm_invsqrt_pd(rsq00);
923 rinv10 = gmx_mm_invsqrt_pd(rsq10);
924 rinv20 = gmx_mm_invsqrt_pd(rsq20);
925 rinv30 = gmx_mm_invsqrt_pd(rsq30);
927 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
928 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
929 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
931 /* Load parameters for j particles */
/* _mm_load_sd fills the low lane with the charge and zeroes the upper lane */
932 jq0 = _mm_load_sd(charge+jnrA+0);
933 vdwjidx0A = 2*vdwtype[jnrA+0];
935 fjx0 = _mm_setzero_pd();
936 fjy0 = _mm_setzero_pd();
937 fjz0 = _mm_setzero_pd();
939 /**************************
940 * CALCULATE INTERACTIONS *
941 **************************/
943 r00 = _mm_mul_pd(rsq00,rinv00);
945 /* Compute parameters for interactions between i and j atoms */
946 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
948 /* Calculate table index by multiplying r with table scale and truncate to integer */
949 rt = _mm_mul_pd(r00,vftabscale);
950 vfitab = _mm_cvttpd_epi32(rt);
/* NOTE(review): as in the loop above, the second vfeps assignment overwrites the first in this view */
952 vfeps = _mm_frcz_pd(rt);
954 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
956 twovfeps = _mm_add_pd(vfeps,vfeps);
957 vfitab = _mm_slli_epi32(vfitab,3);
959 /* CUBIC SPLINE TABLE DISPERSION */
/* Only table index 0 is loaded; zero stands in for the absent second j atom before the transpose */
960 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
961 F = _mm_setzero_pd();
962 GMX_MM_TRANSPOSE2_PD(Y,F);
963 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
964 H = _mm_setzero_pd();
965 GMX_MM_TRANSPOSE2_PD(G,H);
966 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
967 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
968 fvdw6 = _mm_mul_pd(c6_00,FF);
970 /* CUBIC SPLINE TABLE REPULSION */
971 vfitab = _mm_add_epi32(vfitab,ifour);
972 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
973 F = _mm_setzero_pd();
974 GMX_MM_TRANSPOSE2_PD(Y,F);
975 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
976 H = _mm_setzero_pd();
977 GMX_MM_TRANSPOSE2_PD(G,H);
978 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
979 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
980 fvdw12 = _mm_mul_pd(c12_00,FF);
981 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
/* unpacklo with zero keeps the low lane and zeroes the upper one, so the unused lane adds no force;
 * the same clearing is applied before every force update below */
985 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
987 /* Update vectorial force */
988 fix0 = _mm_macc_pd(dx00,fscal,fix0);
989 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
990 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
992 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
993 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
994 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
996 /**************************
997 * CALCULATE INTERACTIONS *
998 **************************/
1000 /* Compute parameters for interactions between i and j atoms */
1001 qq10 = _mm_mul_pd(iq1,jq0);
1003 /* COULOMB ELECTROSTATICS */
1004 velec = _mm_mul_pd(qq10,rinv10);
1005 felec = _mm_mul_pd(velec,rinvsq10);
1009 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1011 /* Update vectorial force */
1012 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1013 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1014 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1016 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1017 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1018 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1020 /**************************
1021 * CALCULATE INTERACTIONS *
1022 **************************/
1024 /* Compute parameters for interactions between i and j atoms */
1025 qq20 = _mm_mul_pd(iq2,jq0);
1027 /* COULOMB ELECTROSTATICS */
1028 velec = _mm_mul_pd(qq20,rinv20);
1029 felec = _mm_mul_pd(velec,rinvsq20);
1033 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1035 /* Update vectorial force */
1036 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1037 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1038 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1040 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1041 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1042 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1044 /**************************
1045 * CALCULATE INTERACTIONS *
1046 **************************/
1048 /* Compute parameters for interactions between i and j atoms */
1049 qq30 = _mm_mul_pd(iq3,jq0);
1051 /* COULOMB ELECTROSTATICS */
1052 velec = _mm_mul_pd(qq30,rinv30);
1053 felec = _mm_mul_pd(velec,rinvsq30);
1057 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1059 /* Update vectorial force */
1060 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1061 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1062 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1064 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1065 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1066 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1068 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1070 /* Inner loop uses 144 flops */
1073 /* End of innermost loop */
/* Hand the twelve i-site force accumulators to the helper together with the f and fshift
 * destinations for this i molecule (helper defined elsewhere) */
1075 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1076 f+i_coord_offset,fshift+i_shift_offset);
1078 /* Increment number of inner iterations */
1079 inneriter += j_index_end - j_index_start;
1081 /* Outer loop uses 24 flops */
1084 /* Increment number of outer iterations */
1087 /* Update outer/inner flops */
/* Flop estimate: 24 per outer iteration plus 144 per j entry, matching the counts quoted above */
1089 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*144);