2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2012,2013,2014,2015,2017,2018, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 * Note: this file was generated by the GROMACS avx_128_fma_double kernel generator.
44 #include "../nb_kernel.h"
45 #include "gromacs/gmxlib/nrnb.h"
47 #include "kernelutil_x86_avx_128_fma_double.h"
50 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_double
51 * Electrostatics interaction: Ewald
52 * VdW interaction: CubicSplineTable
53 * Geometry: Water4-Particle
54 * Calculate force/pot: PotentialAndForce
57 nb_kernel_ElecEw_VdwCSTab_GeomW4P1_VF_avx_128_fma_double
58 (t_nblist * gmx_restrict nlist,
59 rvec * gmx_restrict xx,
60 rvec * gmx_restrict ff,
61 struct t_forcerec * gmx_restrict fr,
62 t_mdatoms * gmx_restrict mdatoms,
63 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
64 t_nrnb * gmx_restrict nrnb)
66 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
67 * just 0 for non-waters.
68 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
69 * jnr indices corresponding to data put in the four positions in the SIMD register.
71 int i_shift_offset,i_coord_offset,outeriter,inneriter;
72 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
74 int j_coord_offsetA,j_coord_offsetB;
75 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
77 real *shiftvec,*fshift,*x,*f;
78 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
80 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
82 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
84 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
86 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
87 int vdwjidx0A,vdwjidx0B;
88 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
89 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
90 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
91 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
92 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
93 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
96 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
99 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
100 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
102 __m128i ifour = _mm_set1_epi32(4);
103 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
106 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
108 __m128d dummy_mask,cutoff_mask;
109 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
110 __m128d one = _mm_set1_pd(1.0);
111 __m128d two = _mm_set1_pd(2.0);
117 jindex = nlist->jindex;
119 shiftidx = nlist->shift;
121 shiftvec = fr->shift_vec[0];
122 fshift = fr->fshift[0];
123 facel = _mm_set1_pd(fr->ic->epsfac);
124 charge = mdatoms->chargeA;
125 nvdwtype = fr->ntype;
127 vdwtype = mdatoms->typeA;
129 vftab = kernel_data->table_vdw->data;
130 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
132 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
133 ewtab = fr->ic->tabq_coul_FDV0;
134 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
135 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
137 /* Setup water-specific parameters */
138 inr = nlist->iinr[0];
139 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
140 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
141 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
142 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
144 /* Avoid stupid compiler warnings */
152 /* Start outer loop over neighborlists */
153 for(iidx=0; iidx<nri; iidx++)
155 /* Load shift vector for this list */
156 i_shift_offset = DIM*shiftidx[iidx];
158 /* Load limits for loop over neighbors */
159 j_index_start = jindex[iidx];
160 j_index_end = jindex[iidx+1];
162 /* Get outer coordinate index */
164 i_coord_offset = DIM*inr;
166 /* Load i particle coords and add shift vector */
167 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
168 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
170 fix0 = _mm_setzero_pd();
171 fiy0 = _mm_setzero_pd();
172 fiz0 = _mm_setzero_pd();
173 fix1 = _mm_setzero_pd();
174 fiy1 = _mm_setzero_pd();
175 fiz1 = _mm_setzero_pd();
176 fix2 = _mm_setzero_pd();
177 fiy2 = _mm_setzero_pd();
178 fiz2 = _mm_setzero_pd();
179 fix3 = _mm_setzero_pd();
180 fiy3 = _mm_setzero_pd();
181 fiz3 = _mm_setzero_pd();
183 /* Reset potential sums */
184 velecsum = _mm_setzero_pd();
185 vvdwsum = _mm_setzero_pd();
187 /* Start inner kernel loop */
188 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
191 /* Get j neighbor index, and coordinate index */
194 j_coord_offsetA = DIM*jnrA;
195 j_coord_offsetB = DIM*jnrB;
197 /* load j atom coordinates */
198 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
201 /* Calculate displacement vector */
202 dx00 = _mm_sub_pd(ix0,jx0);
203 dy00 = _mm_sub_pd(iy0,jy0);
204 dz00 = _mm_sub_pd(iz0,jz0);
205 dx10 = _mm_sub_pd(ix1,jx0);
206 dy10 = _mm_sub_pd(iy1,jy0);
207 dz10 = _mm_sub_pd(iz1,jz0);
208 dx20 = _mm_sub_pd(ix2,jx0);
209 dy20 = _mm_sub_pd(iy2,jy0);
210 dz20 = _mm_sub_pd(iz2,jz0);
211 dx30 = _mm_sub_pd(ix3,jx0);
212 dy30 = _mm_sub_pd(iy3,jy0);
213 dz30 = _mm_sub_pd(iz3,jz0);
215 /* Calculate squared distance and things based on it */
216 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
217 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
218 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
219 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
221 rinv00 = avx128fma_invsqrt_d(rsq00);
222 rinv10 = avx128fma_invsqrt_d(rsq10);
223 rinv20 = avx128fma_invsqrt_d(rsq20);
224 rinv30 = avx128fma_invsqrt_d(rsq30);
226 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
227 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
228 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
230 /* Load parameters for j particles */
231 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
232 vdwjidx0A = 2*vdwtype[jnrA+0];
233 vdwjidx0B = 2*vdwtype[jnrB+0];
235 fjx0 = _mm_setzero_pd();
236 fjy0 = _mm_setzero_pd();
237 fjz0 = _mm_setzero_pd();
239 /**************************
240 * CALCULATE INTERACTIONS *
241 **************************/
243 r00 = _mm_mul_pd(rsq00,rinv00);
245 /* Compute parameters for interactions between i and j atoms */
246 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
247 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
249 /* Calculate table index by multiplying r with table scale and truncate to integer */
250 rt = _mm_mul_pd(r00,vftabscale);
251 vfitab = _mm_cvttpd_epi32(rt);
253 vfeps = _mm_frcz_pd(rt);
255 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
257 twovfeps = _mm_add_pd(vfeps,vfeps);
258 vfitab = _mm_slli_epi32(vfitab,3);
260 /* CUBIC SPLINE TABLE DISPERSION */
261 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
262 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
263 GMX_MM_TRANSPOSE2_PD(Y,F);
264 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
265 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
266 GMX_MM_TRANSPOSE2_PD(G,H);
267 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
268 VV = _mm_macc_pd(vfeps,Fp,Y);
269 vvdw6 = _mm_mul_pd(c6_00,VV);
270 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
271 fvdw6 = _mm_mul_pd(c6_00,FF);
273 /* CUBIC SPLINE TABLE REPULSION */
274 vfitab = _mm_add_epi32(vfitab,ifour);
275 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
276 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
277 GMX_MM_TRANSPOSE2_PD(Y,F);
278 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
279 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
280 GMX_MM_TRANSPOSE2_PD(G,H);
281 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
282 VV = _mm_macc_pd(vfeps,Fp,Y);
283 vvdw12 = _mm_mul_pd(c12_00,VV);
284 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
285 fvdw12 = _mm_mul_pd(c12_00,FF);
286 vvdw = _mm_add_pd(vvdw12,vvdw6);
287 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
289 /* Update potential sum for this i atom from the interaction with this j atom. */
290 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
294 /* Update vectorial force */
295 fix0 = _mm_macc_pd(dx00,fscal,fix0);
296 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
297 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
299 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
300 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
301 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
303 /**************************
304 * CALCULATE INTERACTIONS *
305 **************************/
307 r10 = _mm_mul_pd(rsq10,rinv10);
309 /* Compute parameters for interactions between i and j atoms */
310 qq10 = _mm_mul_pd(iq1,jq0);
312 /* EWALD ELECTROSTATICS */
314 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
315 ewrt = _mm_mul_pd(r10,ewtabscale);
316 ewitab = _mm_cvttpd_epi32(ewrt);
318 eweps = _mm_frcz_pd(ewrt);
320 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
322 twoeweps = _mm_add_pd(eweps,eweps);
323 ewitab = _mm_slli_epi32(ewitab,2);
324 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
325 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
326 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
327 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
328 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
329 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
330 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
331 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
332 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
333 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
335 /* Update potential sum for this i atom from the interaction with this j atom. */
336 velecsum = _mm_add_pd(velecsum,velec);
340 /* Update vectorial force */
341 fix1 = _mm_macc_pd(dx10,fscal,fix1);
342 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
343 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
345 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
346 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
347 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
349 /**************************
350 * CALCULATE INTERACTIONS *
351 **************************/
353 r20 = _mm_mul_pd(rsq20,rinv20);
355 /* Compute parameters for interactions between i and j atoms */
356 qq20 = _mm_mul_pd(iq2,jq0);
358 /* EWALD ELECTROSTATICS */
360 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
361 ewrt = _mm_mul_pd(r20,ewtabscale);
362 ewitab = _mm_cvttpd_epi32(ewrt);
364 eweps = _mm_frcz_pd(ewrt);
366 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
368 twoeweps = _mm_add_pd(eweps,eweps);
369 ewitab = _mm_slli_epi32(ewitab,2);
370 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
371 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
372 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
373 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
374 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
375 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
376 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
377 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
378 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
379 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
381 /* Update potential sum for this i atom from the interaction with this j atom. */
382 velecsum = _mm_add_pd(velecsum,velec);
386 /* Update vectorial force */
387 fix2 = _mm_macc_pd(dx20,fscal,fix2);
388 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
389 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
391 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
392 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
393 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
395 /**************************
396 * CALCULATE INTERACTIONS *
397 **************************/
399 r30 = _mm_mul_pd(rsq30,rinv30);
401 /* Compute parameters for interactions between i and j atoms */
402 qq30 = _mm_mul_pd(iq3,jq0);
404 /* EWALD ELECTROSTATICS */
406 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
407 ewrt = _mm_mul_pd(r30,ewtabscale);
408 ewitab = _mm_cvttpd_epi32(ewrt);
410 eweps = _mm_frcz_pd(ewrt);
412 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
414 twoeweps = _mm_add_pd(eweps,eweps);
415 ewitab = _mm_slli_epi32(ewitab,2);
416 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
417 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
418 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
419 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
420 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
421 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
422 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
423 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
424 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
425 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
427 /* Update potential sum for this i atom from the interaction with this j atom. */
428 velecsum = _mm_add_pd(velecsum,velec);
432 /* Update vectorial force */
433 fix3 = _mm_macc_pd(dx30,fscal,fix3);
434 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
435 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
437 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
438 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
439 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
441 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
443 /* Inner loop uses 194 flops */
450 j_coord_offsetA = DIM*jnrA;
452 /* load j atom coordinates */
453 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
456 /* Calculate displacement vector */
457 dx00 = _mm_sub_pd(ix0,jx0);
458 dy00 = _mm_sub_pd(iy0,jy0);
459 dz00 = _mm_sub_pd(iz0,jz0);
460 dx10 = _mm_sub_pd(ix1,jx0);
461 dy10 = _mm_sub_pd(iy1,jy0);
462 dz10 = _mm_sub_pd(iz1,jz0);
463 dx20 = _mm_sub_pd(ix2,jx0);
464 dy20 = _mm_sub_pd(iy2,jy0);
465 dz20 = _mm_sub_pd(iz2,jz0);
466 dx30 = _mm_sub_pd(ix3,jx0);
467 dy30 = _mm_sub_pd(iy3,jy0);
468 dz30 = _mm_sub_pd(iz3,jz0);
470 /* Calculate squared distance and things based on it */
471 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
472 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
473 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
474 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
476 rinv00 = avx128fma_invsqrt_d(rsq00);
477 rinv10 = avx128fma_invsqrt_d(rsq10);
478 rinv20 = avx128fma_invsqrt_d(rsq20);
479 rinv30 = avx128fma_invsqrt_d(rsq30);
481 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
482 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
483 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
485 /* Load parameters for j particles */
486 jq0 = _mm_load_sd(charge+jnrA+0);
487 vdwjidx0A = 2*vdwtype[jnrA+0];
489 fjx0 = _mm_setzero_pd();
490 fjy0 = _mm_setzero_pd();
491 fjz0 = _mm_setzero_pd();
493 /**************************
494 * CALCULATE INTERACTIONS *
495 **************************/
497 r00 = _mm_mul_pd(rsq00,rinv00);
499 /* Compute parameters for interactions between i and j atoms */
500 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
502 /* Calculate table index by multiplying r with table scale and truncate to integer */
503 rt = _mm_mul_pd(r00,vftabscale);
504 vfitab = _mm_cvttpd_epi32(rt);
506 vfeps = _mm_frcz_pd(rt);
508 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
510 twovfeps = _mm_add_pd(vfeps,vfeps);
511 vfitab = _mm_slli_epi32(vfitab,3);
513 /* CUBIC SPLINE TABLE DISPERSION */
514 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
515 F = _mm_setzero_pd();
516 GMX_MM_TRANSPOSE2_PD(Y,F);
517 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
518 H = _mm_setzero_pd();
519 GMX_MM_TRANSPOSE2_PD(G,H);
520 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
521 VV = _mm_macc_pd(vfeps,Fp,Y);
522 vvdw6 = _mm_mul_pd(c6_00,VV);
523 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
524 fvdw6 = _mm_mul_pd(c6_00,FF);
526 /* CUBIC SPLINE TABLE REPULSION */
527 vfitab = _mm_add_epi32(vfitab,ifour);
528 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
529 F = _mm_setzero_pd();
530 GMX_MM_TRANSPOSE2_PD(Y,F);
531 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
532 H = _mm_setzero_pd();
533 GMX_MM_TRANSPOSE2_PD(G,H);
534 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
535 VV = _mm_macc_pd(vfeps,Fp,Y);
536 vvdw12 = _mm_mul_pd(c12_00,VV);
537 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
538 fvdw12 = _mm_mul_pd(c12_00,FF);
539 vvdw = _mm_add_pd(vvdw12,vvdw6);
540 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
542 /* Update potential sum for this i atom from the interaction with this j atom. */
543 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
544 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
548 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
550 /* Update vectorial force */
551 fix0 = _mm_macc_pd(dx00,fscal,fix0);
552 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
553 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
555 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
556 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
557 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
559 /**************************
560 * CALCULATE INTERACTIONS *
561 **************************/
563 r10 = _mm_mul_pd(rsq10,rinv10);
565 /* Compute parameters for interactions between i and j atoms */
566 qq10 = _mm_mul_pd(iq1,jq0);
568 /* EWALD ELECTROSTATICS */
570 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
571 ewrt = _mm_mul_pd(r10,ewtabscale);
572 ewitab = _mm_cvttpd_epi32(ewrt);
574 eweps = _mm_frcz_pd(ewrt);
576 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
578 twoeweps = _mm_add_pd(eweps,eweps);
579 ewitab = _mm_slli_epi32(ewitab,2);
580 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
581 ewtabD = _mm_setzero_pd();
582 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
583 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
584 ewtabFn = _mm_setzero_pd();
585 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
586 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
587 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
588 velec = _mm_mul_pd(qq10,_mm_sub_pd(rinv10,velec));
589 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
591 /* Update potential sum for this i atom from the interaction with this j atom. */
592 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
593 velecsum = _mm_add_pd(velecsum,velec);
597 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
599 /* Update vectorial force */
600 fix1 = _mm_macc_pd(dx10,fscal,fix1);
601 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
602 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
604 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
605 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
606 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
608 /**************************
609 * CALCULATE INTERACTIONS *
610 **************************/
612 r20 = _mm_mul_pd(rsq20,rinv20);
614 /* Compute parameters for interactions between i and j atoms */
615 qq20 = _mm_mul_pd(iq2,jq0);
617 /* EWALD ELECTROSTATICS */
619 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
620 ewrt = _mm_mul_pd(r20,ewtabscale);
621 ewitab = _mm_cvttpd_epi32(ewrt);
623 eweps = _mm_frcz_pd(ewrt);
625 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
627 twoeweps = _mm_add_pd(eweps,eweps);
628 ewitab = _mm_slli_epi32(ewitab,2);
629 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
630 ewtabD = _mm_setzero_pd();
631 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
632 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
633 ewtabFn = _mm_setzero_pd();
634 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
635 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
636 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
637 velec = _mm_mul_pd(qq20,_mm_sub_pd(rinv20,velec));
638 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
640 /* Update potential sum for this i atom from the interaction with this j atom. */
641 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
642 velecsum = _mm_add_pd(velecsum,velec);
646 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
648 /* Update vectorial force */
649 fix2 = _mm_macc_pd(dx20,fscal,fix2);
650 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
651 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
653 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
654 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
655 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
657 /**************************
658 * CALCULATE INTERACTIONS *
659 **************************/
661 r30 = _mm_mul_pd(rsq30,rinv30);
663 /* Compute parameters for interactions between i and j atoms */
664 qq30 = _mm_mul_pd(iq3,jq0);
666 /* EWALD ELECTROSTATICS */
668 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
669 ewrt = _mm_mul_pd(r30,ewtabscale);
670 ewitab = _mm_cvttpd_epi32(ewrt);
672 eweps = _mm_frcz_pd(ewrt);
674 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
676 twoeweps = _mm_add_pd(eweps,eweps);
677 ewitab = _mm_slli_epi32(ewitab,2);
678 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
679 ewtabD = _mm_setzero_pd();
680 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
681 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
682 ewtabFn = _mm_setzero_pd();
683 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
684 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
685 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
686 velec = _mm_mul_pd(qq30,_mm_sub_pd(rinv30,velec));
687 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
689 /* Update potential sum for this i atom from the interaction with this j atom. */
690 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
691 velecsum = _mm_add_pd(velecsum,velec);
695 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
697 /* Update vectorial force */
698 fix3 = _mm_macc_pd(dx30,fscal,fix3);
699 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
700 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
702 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
703 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
704 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
706 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
708 /* Inner loop uses 194 flops */
711 /* End of innermost loop */
713 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
714 f+i_coord_offset,fshift+i_shift_offset);
717 /* Update potential energies */
718 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
719 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
721 /* Increment number of inner iterations */
722 inneriter += j_index_end - j_index_start;
724 /* Outer loop uses 26 flops */
727 /* Increment number of outer iterations */
730 /* Update outer/inner flops */
732 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*194);
735 * Gromacs nonbonded kernel: nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_double
736 * Electrostatics interaction: Ewald
737 * VdW interaction: CubicSplineTable
738 * Geometry: Water4-Particle
739 * Calculate force/pot: Force
742 nb_kernel_ElecEw_VdwCSTab_GeomW4P1_F_avx_128_fma_double
743 (t_nblist * gmx_restrict nlist,
744 rvec * gmx_restrict xx,
745 rvec * gmx_restrict ff,
746 struct t_forcerec * gmx_restrict fr,
747 t_mdatoms * gmx_restrict mdatoms,
748 nb_kernel_data_t gmx_unused * gmx_restrict kernel_data,
749 t_nrnb * gmx_restrict nrnb)
751 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
752 * just 0 for non-waters.
753 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
754 * jnr indices corresponding to data put in the four positions in the SIMD register.
756 int i_shift_offset,i_coord_offset,outeriter,inneriter;
757 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
759 int j_coord_offsetA,j_coord_offsetB;
760 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
762 real *shiftvec,*fshift,*x,*f;
763 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
765 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
767 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
769 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
771 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
772 int vdwjidx0A,vdwjidx0B;
773 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
774 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
775 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
776 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
777 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
778 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
781 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
784 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
785 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
787 __m128i ifour = _mm_set1_epi32(4);
788 __m128d rt,vfeps,vftabscale,Y,F,G,H,Heps,Fp,VV,FF,twovfeps;
791 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
793 __m128d dummy_mask,cutoff_mask;
794 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
795 __m128d one = _mm_set1_pd(1.0);
796 __m128d two = _mm_set1_pd(2.0);
802 jindex = nlist->jindex;
804 shiftidx = nlist->shift;
806 shiftvec = fr->shift_vec[0];
807 fshift = fr->fshift[0];
808 facel = _mm_set1_pd(fr->ic->epsfac);
809 charge = mdatoms->chargeA;
810 nvdwtype = fr->ntype;
812 vdwtype = mdatoms->typeA;
814 vftab = kernel_data->table_vdw->data;
815 vftabscale = _mm_set1_pd(kernel_data->table_vdw->scale);
817 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
818 ewtab = fr->ic->tabq_coul_F;
819 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
820 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
822 /* Setup water-specific parameters */
823 inr = nlist->iinr[0];
824 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
825 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
826 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
827 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
829 /* Avoid stupid compiler warnings */
837 /* Start outer loop over neighborlists */
838 for(iidx=0; iidx<nri; iidx++)
840 /* Load shift vector for this list */
841 i_shift_offset = DIM*shiftidx[iidx];
843 /* Load limits for loop over neighbors */
844 j_index_start = jindex[iidx];
845 j_index_end = jindex[iidx+1];
847 /* Get outer coordinate index */
849 i_coord_offset = DIM*inr;
851 /* Load i particle coords and add shift vector */
852 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
853 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
855 fix0 = _mm_setzero_pd();
856 fiy0 = _mm_setzero_pd();
857 fiz0 = _mm_setzero_pd();
858 fix1 = _mm_setzero_pd();
859 fiy1 = _mm_setzero_pd();
860 fiz1 = _mm_setzero_pd();
861 fix2 = _mm_setzero_pd();
862 fiy2 = _mm_setzero_pd();
863 fiz2 = _mm_setzero_pd();
864 fix3 = _mm_setzero_pd();
865 fiy3 = _mm_setzero_pd();
866 fiz3 = _mm_setzero_pd();
868 /* Start inner kernel loop */
869 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
872 /* Get j neighbor index, and coordinate index */
875 j_coord_offsetA = DIM*jnrA;
876 j_coord_offsetB = DIM*jnrB;
878 /* load j atom coordinates */
879 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
882 /* Calculate displacement vector */
883 dx00 = _mm_sub_pd(ix0,jx0);
884 dy00 = _mm_sub_pd(iy0,jy0);
885 dz00 = _mm_sub_pd(iz0,jz0);
886 dx10 = _mm_sub_pd(ix1,jx0);
887 dy10 = _mm_sub_pd(iy1,jy0);
888 dz10 = _mm_sub_pd(iz1,jz0);
889 dx20 = _mm_sub_pd(ix2,jx0);
890 dy20 = _mm_sub_pd(iy2,jy0);
891 dz20 = _mm_sub_pd(iz2,jz0);
892 dx30 = _mm_sub_pd(ix3,jx0);
893 dy30 = _mm_sub_pd(iy3,jy0);
894 dz30 = _mm_sub_pd(iz3,jz0);
896 /* Calculate squared distance and things based on it */
897 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
898 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
899 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
900 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
902 rinv00 = avx128fma_invsqrt_d(rsq00);
903 rinv10 = avx128fma_invsqrt_d(rsq10);
904 rinv20 = avx128fma_invsqrt_d(rsq20);
905 rinv30 = avx128fma_invsqrt_d(rsq30);
907 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
908 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
909 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
911 /* Load parameters for j particles */
912 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
913 vdwjidx0A = 2*vdwtype[jnrA+0];
914 vdwjidx0B = 2*vdwtype[jnrB+0];
916 fjx0 = _mm_setzero_pd();
917 fjy0 = _mm_setzero_pd();
918 fjz0 = _mm_setzero_pd();
920 /**************************
921 * CALCULATE INTERACTIONS *
922 **************************/
924 r00 = _mm_mul_pd(rsq00,rinv00);
926 /* Compute parameters for interactions between i and j atoms */
927 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
928 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
930 /* Calculate table index by multiplying r with table scale and truncate to integer */
931 rt = _mm_mul_pd(r00,vftabscale);
932 vfitab = _mm_cvttpd_epi32(rt);
934 vfeps = _mm_frcz_pd(rt);
936 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
938 twovfeps = _mm_add_pd(vfeps,vfeps);
939 vfitab = _mm_slli_epi32(vfitab,3);
941 /* CUBIC SPLINE TABLE DISPERSION */
942 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
943 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
944 GMX_MM_TRANSPOSE2_PD(Y,F);
945 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
946 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
947 GMX_MM_TRANSPOSE2_PD(G,H);
948 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
949 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
950 fvdw6 = _mm_mul_pd(c6_00,FF);
952 /* CUBIC SPLINE TABLE REPULSION */
953 vfitab = _mm_add_epi32(vfitab,ifour);
954 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
955 F = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) );
956 GMX_MM_TRANSPOSE2_PD(Y,F);
957 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
958 H = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,1) +2);
959 GMX_MM_TRANSPOSE2_PD(G,H);
960 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
961 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
962 fvdw12 = _mm_mul_pd(c12_00,FF);
963 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
967 /* Update vectorial force */
968 fix0 = _mm_macc_pd(dx00,fscal,fix0);
969 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
970 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
972 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
973 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
974 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
976 /**************************
977 * CALCULATE INTERACTIONS *
978 **************************/
980 r10 = _mm_mul_pd(rsq10,rinv10);
982 /* Compute parameters for interactions between i and j atoms */
983 qq10 = _mm_mul_pd(iq1,jq0);
985 /* EWALD ELECTROSTATICS */
987 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
988 ewrt = _mm_mul_pd(r10,ewtabscale);
989 ewitab = _mm_cvttpd_epi32(ewrt);
991 eweps = _mm_frcz_pd(ewrt);
993 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
995 twoeweps = _mm_add_pd(eweps,eweps);
996 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
998 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
999 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1003 /* Update vectorial force */
1004 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1005 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1006 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1008 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1009 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1010 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1012 /**************************
1013 * CALCULATE INTERACTIONS *
1014 **************************/
1016 r20 = _mm_mul_pd(rsq20,rinv20);
1018 /* Compute parameters for interactions between i and j atoms */
1019 qq20 = _mm_mul_pd(iq2,jq0);
1021 /* EWALD ELECTROSTATICS */
1023 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1024 ewrt = _mm_mul_pd(r20,ewtabscale);
1025 ewitab = _mm_cvttpd_epi32(ewrt);
1027 eweps = _mm_frcz_pd(ewrt);
1029 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1031 twoeweps = _mm_add_pd(eweps,eweps);
1032 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1034 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1035 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1039 /* Update vectorial force */
1040 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1041 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1042 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1044 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1045 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1046 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1048 /**************************
1049 * CALCULATE INTERACTIONS *
1050 **************************/
1052 r30 = _mm_mul_pd(rsq30,rinv30);
1054 /* Compute parameters for interactions between i and j atoms */
1055 qq30 = _mm_mul_pd(iq3,jq0);
1057 /* EWALD ELECTROSTATICS */
1059 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1060 ewrt = _mm_mul_pd(r30,ewtabscale);
1061 ewitab = _mm_cvttpd_epi32(ewrt);
1063 eweps = _mm_frcz_pd(ewrt);
1065 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1067 twoeweps = _mm_add_pd(eweps,eweps);
1068 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1070 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1071 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1075 /* Update vectorial force */
1076 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1077 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1078 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1080 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1081 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1082 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1084 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1086 /* Inner loop uses 171 flops */
1089 if(jidx<j_index_end)
1093 j_coord_offsetA = DIM*jnrA;
1095 /* load j atom coordinates */
1096 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1099 /* Calculate displacement vector */
1100 dx00 = _mm_sub_pd(ix0,jx0);
1101 dy00 = _mm_sub_pd(iy0,jy0);
1102 dz00 = _mm_sub_pd(iz0,jz0);
1103 dx10 = _mm_sub_pd(ix1,jx0);
1104 dy10 = _mm_sub_pd(iy1,jy0);
1105 dz10 = _mm_sub_pd(iz1,jz0);
1106 dx20 = _mm_sub_pd(ix2,jx0);
1107 dy20 = _mm_sub_pd(iy2,jy0);
1108 dz20 = _mm_sub_pd(iz2,jz0);
1109 dx30 = _mm_sub_pd(ix3,jx0);
1110 dy30 = _mm_sub_pd(iy3,jy0);
1111 dz30 = _mm_sub_pd(iz3,jz0);
1113 /* Calculate squared distance and things based on it */
1114 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1115 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1116 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1117 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1119 rinv00 = avx128fma_invsqrt_d(rsq00);
1120 rinv10 = avx128fma_invsqrt_d(rsq10);
1121 rinv20 = avx128fma_invsqrt_d(rsq20);
1122 rinv30 = avx128fma_invsqrt_d(rsq30);
1124 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1125 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1126 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
1128 /* Load parameters for j particles */
1129 jq0 = _mm_load_sd(charge+jnrA+0);
1130 vdwjidx0A = 2*vdwtype[jnrA+0];
1132 fjx0 = _mm_setzero_pd();
1133 fjy0 = _mm_setzero_pd();
1134 fjz0 = _mm_setzero_pd();
1136 /**************************
1137 * CALCULATE INTERACTIONS *
1138 **************************/
1140 r00 = _mm_mul_pd(rsq00,rinv00);
1142 /* Compute parameters for interactions between i and j atoms */
1143 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1145 /* Calculate table index by multiplying r with table scale and truncate to integer */
1146 rt = _mm_mul_pd(r00,vftabscale);
1147 vfitab = _mm_cvttpd_epi32(rt);
1149 vfeps = _mm_frcz_pd(rt);
1151 vfeps = _mm_sub_pd(rt,_mm_round_pd(rt, _MM_FROUND_FLOOR));
1153 twovfeps = _mm_add_pd(vfeps,vfeps);
1154 vfitab = _mm_slli_epi32(vfitab,3);
1156 /* CUBIC SPLINE TABLE DISPERSION */
1157 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1158 F = _mm_setzero_pd();
1159 GMX_MM_TRANSPOSE2_PD(Y,F);
1160 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1161 H = _mm_setzero_pd();
1162 GMX_MM_TRANSPOSE2_PD(G,H);
1163 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1164 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1165 fvdw6 = _mm_mul_pd(c6_00,FF);
1167 /* CUBIC SPLINE TABLE REPULSION */
1168 vfitab = _mm_add_epi32(vfitab,ifour);
1169 Y = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) );
1170 F = _mm_setzero_pd();
1171 GMX_MM_TRANSPOSE2_PD(Y,F);
1172 G = _mm_load_pd( vftab + _mm_extract_epi32(vfitab,0) +2);
1173 H = _mm_setzero_pd();
1174 GMX_MM_TRANSPOSE2_PD(G,H);
1175 Fp = _mm_macc_pd(vfeps,_mm_macc_pd(H,vfeps,G),F);
1176 FF = _mm_macc_pd(vfeps,_mm_macc_pd(twovfeps,H,G),Fp);
1177 fvdw12 = _mm_mul_pd(c12_00,FF);
1178 fvdw = _mm_xor_pd(signbit,_mm_mul_pd(_mm_add_pd(fvdw6,fvdw12),_mm_mul_pd(vftabscale,rinv00)));
1182 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1184 /* Update vectorial force */
1185 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1186 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1187 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1189 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1190 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1191 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1193 /**************************
1194 * CALCULATE INTERACTIONS *
1195 **************************/
1197 r10 = _mm_mul_pd(rsq10,rinv10);
1199 /* Compute parameters for interactions between i and j atoms */
1200 qq10 = _mm_mul_pd(iq1,jq0);
1202 /* EWALD ELECTROSTATICS */
1204 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1205 ewrt = _mm_mul_pd(r10,ewtabscale);
1206 ewitab = _mm_cvttpd_epi32(ewrt);
1208 eweps = _mm_frcz_pd(ewrt);
1210 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1212 twoeweps = _mm_add_pd(eweps,eweps);
1213 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1214 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1215 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1219 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1221 /* Update vectorial force */
1222 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1223 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1224 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1226 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1227 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1228 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1230 /**************************
1231 * CALCULATE INTERACTIONS *
1232 **************************/
1234 r20 = _mm_mul_pd(rsq20,rinv20);
1236 /* Compute parameters for interactions between i and j atoms */
1237 qq20 = _mm_mul_pd(iq2,jq0);
1239 /* EWALD ELECTROSTATICS */
1241 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1242 ewrt = _mm_mul_pd(r20,ewtabscale);
1243 ewitab = _mm_cvttpd_epi32(ewrt);
1245 eweps = _mm_frcz_pd(ewrt);
1247 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1249 twoeweps = _mm_add_pd(eweps,eweps);
1250 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1251 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1252 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1256 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1258 /* Update vectorial force */
1259 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1260 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1261 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1263 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1264 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1265 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1267 /**************************
1268 * CALCULATE INTERACTIONS *
1269 **************************/
1271 r30 = _mm_mul_pd(rsq30,rinv30);
1273 /* Compute parameters for interactions between i and j atoms */
1274 qq30 = _mm_mul_pd(iq3,jq0);
1276 /* EWALD ELECTROSTATICS */
1278 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1279 ewrt = _mm_mul_pd(r30,ewtabscale);
1280 ewitab = _mm_cvttpd_epi32(ewrt);
1282 eweps = _mm_frcz_pd(ewrt);
1284 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1286 twoeweps = _mm_add_pd(eweps,eweps);
1287 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1288 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1289 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1293 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1295 /* Update vectorial force */
1296 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1297 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1298 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1300 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1301 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1302 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1304 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1306 /* Inner loop uses 171 flops */
1309 /* End of innermost loop */
1311 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1312 f+i_coord_offset,fshift+i_shift_offset);
1314 /* Increment number of inner iterations */
1315 inneriter += j_index_end - j_index_start;
1317 /* Outer loop uses 24 flops */
1320 /* Increment number of outer iterations */
1323 /* Update outer/inner flops */
1325 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*171);