2 * Note: this file was generated by the Gromacs avx_128_fma_double kernel generator.
4 * This source code is part of
8 * Copyright (c) 2001-2012, The GROMACS Development Team
10 * Gromacs is a library for molecular simulation and trajectory analysis,
11 * written by Erik Lindahl, David van der Spoel, Berk Hess, and others - for
12 * a full list of developers and information, check out http://www.gromacs.org
14 * This program is free software; you can redistribute it and/or modify it under
15 * the terms of the GNU Lesser General Public License as published by the Free
16 * Software Foundation; either version 2 of the License, or (at your option) any
19 * To help fund GROMACS development, we humbly ask that you cite
20 * the papers people have written on it - you can find them on the website.
28 #include "../nb_kernel.h"
29 #include "types/simple.h"
33 #include "gmx_math_x86_avx_128_fma_double.h"
34 #include "kernelutil_x86_avx_128_fma_double.h"
37 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_double
38 * Electrostatics interaction: Ewald
39 * VdW interaction: LennardJones
40 * Geometry: Water4-Particle
41 * Calculate force/pot: PotentialAndForce
44 nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_VF_avx_128_fma_double
45 (t_nblist * gmx_restrict nlist,
46 rvec * gmx_restrict xx,
47 rvec * gmx_restrict ff,
48 t_forcerec * gmx_restrict fr,
49 t_mdatoms * gmx_restrict mdatoms,
50 nb_kernel_data_t * gmx_restrict kernel_data,
51 t_nrnb * gmx_restrict nrnb)
53 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
54 * just 0 for non-waters.
55 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
56 * jnr indices corresponding to data put in the four positions in the SIMD register.
58 int i_shift_offset,i_coord_offset,outeriter,inneriter;
59 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
61 int j_coord_offsetA,j_coord_offsetB;
62 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
64 real *shiftvec,*fshift,*x,*f;
65 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
67 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
69 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
71 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
73 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
74 int vdwjidx0A,vdwjidx0B;
75 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
76 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
77 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
78 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
79 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
80 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
83 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
86 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
87 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
89 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
91 __m128d dummy_mask,cutoff_mask;
92 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
93 __m128d one = _mm_set1_pd(1.0);
94 __m128d two = _mm_set1_pd(2.0);
100 jindex = nlist->jindex;
102 shiftidx = nlist->shift;
104 shiftvec = fr->shift_vec[0];
105 fshift = fr->fshift[0];
106 facel = _mm_set1_pd(fr->epsfac);
107 charge = mdatoms->chargeA;
108 nvdwtype = fr->ntype;
110 vdwtype = mdatoms->typeA;
112 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
113 ewtab = fr->ic->tabq_coul_FDV0;
114 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
115 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
117 /* Setup water-specific parameters */
118 inr = nlist->iinr[0];
119 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
120 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
121 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
122 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
124 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
125 rcutoff_scalar = fr->rcoulomb;
126 rcutoff = _mm_set1_pd(rcutoff_scalar);
127 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
129 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
130 rvdw = _mm_set1_pd(fr->rvdw);
132 /* Avoid stupid compiler warnings */
140 /* Start outer loop over neighborlists */
141 for(iidx=0; iidx<nri; iidx++)
143 /* Load shift vector for this list */
144 i_shift_offset = DIM*shiftidx[iidx];
146 /* Load limits for loop over neighbors */
147 j_index_start = jindex[iidx];
148 j_index_end = jindex[iidx+1];
150 /* Get outer coordinate index */
152 i_coord_offset = DIM*inr;
154 /* Load i particle coords and add shift vector */
155 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
156 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
158 fix0 = _mm_setzero_pd();
159 fiy0 = _mm_setzero_pd();
160 fiz0 = _mm_setzero_pd();
161 fix1 = _mm_setzero_pd();
162 fiy1 = _mm_setzero_pd();
163 fiz1 = _mm_setzero_pd();
164 fix2 = _mm_setzero_pd();
165 fiy2 = _mm_setzero_pd();
166 fiz2 = _mm_setzero_pd();
167 fix3 = _mm_setzero_pd();
168 fiy3 = _mm_setzero_pd();
169 fiz3 = _mm_setzero_pd();
171 /* Reset potential sums */
172 velecsum = _mm_setzero_pd();
173 vvdwsum = _mm_setzero_pd();
175 /* Start inner kernel loop */
176 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
179 /* Get j neighbor index, and coordinate index */
182 j_coord_offsetA = DIM*jnrA;
183 j_coord_offsetB = DIM*jnrB;
185 /* load j atom coordinates */
186 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
189 /* Calculate displacement vector */
190 dx00 = _mm_sub_pd(ix0,jx0);
191 dy00 = _mm_sub_pd(iy0,jy0);
192 dz00 = _mm_sub_pd(iz0,jz0);
193 dx10 = _mm_sub_pd(ix1,jx0);
194 dy10 = _mm_sub_pd(iy1,jy0);
195 dz10 = _mm_sub_pd(iz1,jz0);
196 dx20 = _mm_sub_pd(ix2,jx0);
197 dy20 = _mm_sub_pd(iy2,jy0);
198 dz20 = _mm_sub_pd(iz2,jz0);
199 dx30 = _mm_sub_pd(ix3,jx0);
200 dy30 = _mm_sub_pd(iy3,jy0);
201 dz30 = _mm_sub_pd(iz3,jz0);
203 /* Calculate squared distance and things based on it */
204 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
205 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
206 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
207 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
209 rinv10 = gmx_mm_invsqrt_pd(rsq10);
210 rinv20 = gmx_mm_invsqrt_pd(rsq20);
211 rinv30 = gmx_mm_invsqrt_pd(rsq30);
213 rinvsq00 = gmx_mm_inv_pd(rsq00);
214 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
215 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
216 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
218 /* Load parameters for j particles */
219 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
220 vdwjidx0A = 2*vdwtype[jnrA+0];
221 vdwjidx0B = 2*vdwtype[jnrB+0];
223 fjx0 = _mm_setzero_pd();
224 fjy0 = _mm_setzero_pd();
225 fjz0 = _mm_setzero_pd();
227 /**************************
228 * CALCULATE INTERACTIONS *
229 **************************/
231 if (gmx_mm_any_lt(rsq00,rcutoff2))
234 /* Compute parameters for interactions between i and j atoms */
235 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
236 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
238 /* LENNARD-JONES DISPERSION/REPULSION */
240 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
241 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
242 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
243 vvdw = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
244 _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
245 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
247 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
249 /* Update potential sum for this i atom from the interaction with this j atom. */
250 vvdw = _mm_and_pd(vvdw,cutoff_mask);
251 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
255 fscal = _mm_and_pd(fscal,cutoff_mask);
257 /* Update vectorial force */
258 fix0 = _mm_macc_pd(dx00,fscal,fix0);
259 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
260 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
262 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
263 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
264 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
268 /**************************
269 * CALCULATE INTERACTIONS *
270 **************************/
272 if (gmx_mm_any_lt(rsq10,rcutoff2))
275 r10 = _mm_mul_pd(rsq10,rinv10);
277 /* Compute parameters for interactions between i and j atoms */
278 qq10 = _mm_mul_pd(iq1,jq0);
280 /* EWALD ELECTROSTATICS */
282 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
283 ewrt = _mm_mul_pd(r10,ewtabscale);
284 ewitab = _mm_cvttpd_epi32(ewrt);
286 eweps = _mm_frcz_pd(ewrt);
288 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
290 twoeweps = _mm_add_pd(eweps,eweps);
291 ewitab = _mm_slli_epi32(ewitab,2);
292 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
293 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
294 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
295 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
296 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
297 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
298 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
299 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
300 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
301 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
303 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
305 /* Update potential sum for this i atom from the interaction with this j atom. */
306 velec = _mm_and_pd(velec,cutoff_mask);
307 velecsum = _mm_add_pd(velecsum,velec);
311 fscal = _mm_and_pd(fscal,cutoff_mask);
313 /* Update vectorial force */
314 fix1 = _mm_macc_pd(dx10,fscal,fix1);
315 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
316 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
318 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
319 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
320 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
324 /**************************
325 * CALCULATE INTERACTIONS *
326 **************************/
328 if (gmx_mm_any_lt(rsq20,rcutoff2))
331 r20 = _mm_mul_pd(rsq20,rinv20);
333 /* Compute parameters for interactions between i and j atoms */
334 qq20 = _mm_mul_pd(iq2,jq0);
336 /* EWALD ELECTROSTATICS */
338 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
339 ewrt = _mm_mul_pd(r20,ewtabscale);
340 ewitab = _mm_cvttpd_epi32(ewrt);
342 eweps = _mm_frcz_pd(ewrt);
344 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
346 twoeweps = _mm_add_pd(eweps,eweps);
347 ewitab = _mm_slli_epi32(ewitab,2);
348 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
349 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
350 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
351 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
352 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
353 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
354 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
355 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
356 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
357 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
359 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
361 /* Update potential sum for this i atom from the interaction with this j atom. */
362 velec = _mm_and_pd(velec,cutoff_mask);
363 velecsum = _mm_add_pd(velecsum,velec);
367 fscal = _mm_and_pd(fscal,cutoff_mask);
369 /* Update vectorial force */
370 fix2 = _mm_macc_pd(dx20,fscal,fix2);
371 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
372 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
374 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
375 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
376 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
380 /**************************
381 * CALCULATE INTERACTIONS *
382 **************************/
384 if (gmx_mm_any_lt(rsq30,rcutoff2))
387 r30 = _mm_mul_pd(rsq30,rinv30);
389 /* Compute parameters for interactions between i and j atoms */
390 qq30 = _mm_mul_pd(iq3,jq0);
392 /* EWALD ELECTROSTATICS */
394 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
395 ewrt = _mm_mul_pd(r30,ewtabscale);
396 ewitab = _mm_cvttpd_epi32(ewrt);
398 eweps = _mm_frcz_pd(ewrt);
400 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
402 twoeweps = _mm_add_pd(eweps,eweps);
403 ewitab = _mm_slli_epi32(ewitab,2);
404 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
405 ewtabD = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,1) );
406 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
407 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
408 ewtabFn = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,1) +2);
409 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
410 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
411 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
412 velec = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
413 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
415 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
417 /* Update potential sum for this i atom from the interaction with this j atom. */
418 velec = _mm_and_pd(velec,cutoff_mask);
419 velecsum = _mm_add_pd(velecsum,velec);
423 fscal = _mm_and_pd(fscal,cutoff_mask);
425 /* Update vectorial force */
426 fix3 = _mm_macc_pd(dx30,fscal,fix3);
427 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
428 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
430 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
431 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
432 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
436 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
438 /* Inner loop uses 194 flops */
445 j_coord_offsetA = DIM*jnrA;
447 /* load j atom coordinates */
448 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
451 /* Calculate displacement vector */
452 dx00 = _mm_sub_pd(ix0,jx0);
453 dy00 = _mm_sub_pd(iy0,jy0);
454 dz00 = _mm_sub_pd(iz0,jz0);
455 dx10 = _mm_sub_pd(ix1,jx0);
456 dy10 = _mm_sub_pd(iy1,jy0);
457 dz10 = _mm_sub_pd(iz1,jz0);
458 dx20 = _mm_sub_pd(ix2,jx0);
459 dy20 = _mm_sub_pd(iy2,jy0);
460 dz20 = _mm_sub_pd(iz2,jz0);
461 dx30 = _mm_sub_pd(ix3,jx0);
462 dy30 = _mm_sub_pd(iy3,jy0);
463 dz30 = _mm_sub_pd(iz3,jz0);
465 /* Calculate squared distance and things based on it */
466 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
467 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
468 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
469 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
471 rinv10 = gmx_mm_invsqrt_pd(rsq10);
472 rinv20 = gmx_mm_invsqrt_pd(rsq20);
473 rinv30 = gmx_mm_invsqrt_pd(rsq30);
475 rinvsq00 = gmx_mm_inv_pd(rsq00);
476 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
477 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
478 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
480 /* Load parameters for j particles */
481 jq0 = _mm_load_sd(charge+jnrA+0);
482 vdwjidx0A = 2*vdwtype[jnrA+0];
484 fjx0 = _mm_setzero_pd();
485 fjy0 = _mm_setzero_pd();
486 fjz0 = _mm_setzero_pd();
488 /**************************
489 * CALCULATE INTERACTIONS *
490 **************************/
492 if (gmx_mm_any_lt(rsq00,rcutoff2))
495 /* Compute parameters for interactions between i and j atoms */
496 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
498 /* LENNARD-JONES DISPERSION/REPULSION */
500 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
501 vvdw6 = _mm_mul_pd(c6_00,rinvsix);
502 vvdw12 = _mm_mul_pd(c12_00,_mm_mul_pd(rinvsix,rinvsix));
503 vvdw = _mm_msub_pd(_mm_nmacc_pd(c12_00,_mm_mul_pd(sh_vdw_invrcut6,sh_vdw_invrcut6),vvdw12),one_twelfth,
504 _mm_mul_pd(_mm_nmacc_pd( c6_00,sh_vdw_invrcut6,vvdw6),one_sixth));
505 fvdw = _mm_mul_pd(_mm_sub_pd(vvdw12,vvdw6),rinvsq00);
507 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
509 /* Update potential sum for this i atom from the interaction with this j atom. */
510 vvdw = _mm_and_pd(vvdw,cutoff_mask);
511 vvdw = _mm_unpacklo_pd(vvdw,_mm_setzero_pd());
512 vvdwsum = _mm_add_pd(vvdwsum,vvdw);
516 fscal = _mm_and_pd(fscal,cutoff_mask);
518 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
520 /* Update vectorial force */
521 fix0 = _mm_macc_pd(dx00,fscal,fix0);
522 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
523 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
525 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
526 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
527 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
531 /**************************
532 * CALCULATE INTERACTIONS *
533 **************************/
535 if (gmx_mm_any_lt(rsq10,rcutoff2))
538 r10 = _mm_mul_pd(rsq10,rinv10);
540 /* Compute parameters for interactions between i and j atoms */
541 qq10 = _mm_mul_pd(iq1,jq0);
543 /* EWALD ELECTROSTATICS */
545 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
546 ewrt = _mm_mul_pd(r10,ewtabscale);
547 ewitab = _mm_cvttpd_epi32(ewrt);
549 eweps = _mm_frcz_pd(ewrt);
551 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
553 twoeweps = _mm_add_pd(eweps,eweps);
554 ewitab = _mm_slli_epi32(ewitab,2);
555 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
556 ewtabD = _mm_setzero_pd();
557 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
558 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
559 ewtabFn = _mm_setzero_pd();
560 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
561 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
562 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
563 velec = _mm_mul_pd(qq10,_mm_sub_pd(_mm_sub_pd(rinv10,sh_ewald),velec));
564 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
566 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
568 /* Update potential sum for this i atom from the interaction with this j atom. */
569 velec = _mm_and_pd(velec,cutoff_mask);
570 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
571 velecsum = _mm_add_pd(velecsum,velec);
575 fscal = _mm_and_pd(fscal,cutoff_mask);
577 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
579 /* Update vectorial force */
580 fix1 = _mm_macc_pd(dx10,fscal,fix1);
581 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
582 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
584 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
585 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
586 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
590 /**************************
591 * CALCULATE INTERACTIONS *
592 **************************/
594 if (gmx_mm_any_lt(rsq20,rcutoff2))
597 r20 = _mm_mul_pd(rsq20,rinv20);
599 /* Compute parameters for interactions between i and j atoms */
600 qq20 = _mm_mul_pd(iq2,jq0);
602 /* EWALD ELECTROSTATICS */
604 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
605 ewrt = _mm_mul_pd(r20,ewtabscale);
606 ewitab = _mm_cvttpd_epi32(ewrt);
608 eweps = _mm_frcz_pd(ewrt);
610 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
612 twoeweps = _mm_add_pd(eweps,eweps);
613 ewitab = _mm_slli_epi32(ewitab,2);
614 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
615 ewtabD = _mm_setzero_pd();
616 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
617 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
618 ewtabFn = _mm_setzero_pd();
619 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
620 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
621 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
622 velec = _mm_mul_pd(qq20,_mm_sub_pd(_mm_sub_pd(rinv20,sh_ewald),velec));
623 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
625 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
627 /* Update potential sum for this i atom from the interaction with this j atom. */
628 velec = _mm_and_pd(velec,cutoff_mask);
629 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
630 velecsum = _mm_add_pd(velecsum,velec);
634 fscal = _mm_and_pd(fscal,cutoff_mask);
636 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
638 /* Update vectorial force */
639 fix2 = _mm_macc_pd(dx20,fscal,fix2);
640 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
641 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
643 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
644 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
645 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
649 /**************************
650 * CALCULATE INTERACTIONS *
651 **************************/
653 if (gmx_mm_any_lt(rsq30,rcutoff2))
656 r30 = _mm_mul_pd(rsq30,rinv30);
658 /* Compute parameters for interactions between i and j atoms */
659 qq30 = _mm_mul_pd(iq3,jq0);
661 /* EWALD ELECTROSTATICS */
663 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
664 ewrt = _mm_mul_pd(r30,ewtabscale);
665 ewitab = _mm_cvttpd_epi32(ewrt);
667 eweps = _mm_frcz_pd(ewrt);
669 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
671 twoeweps = _mm_add_pd(eweps,eweps);
672 ewitab = _mm_slli_epi32(ewitab,2);
673 ewtabF = _mm_load_pd( ewtab + _mm_extract_epi32(ewitab,0) );
674 ewtabD = _mm_setzero_pd();
675 GMX_MM_TRANSPOSE2_PD(ewtabF,ewtabD);
676 ewtabV = _mm_load_sd( ewtab + _mm_extract_epi32(ewitab,0) +2);
677 ewtabFn = _mm_setzero_pd();
678 GMX_MM_TRANSPOSE2_PD(ewtabV,ewtabFn);
679 felec = _mm_macc_pd(eweps,ewtabD,ewtabF);
680 velec = _mm_nmacc_pd(_mm_mul_pd(ewtabhalfspace,eweps) ,_mm_add_pd(ewtabF,felec), ewtabV);
681 velec = _mm_mul_pd(qq30,_mm_sub_pd(_mm_sub_pd(rinv30,sh_ewald),velec));
682 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
684 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
686 /* Update potential sum for this i atom from the interaction with this j atom. */
687 velec = _mm_and_pd(velec,cutoff_mask);
688 velec = _mm_unpacklo_pd(velec,_mm_setzero_pd());
689 velecsum = _mm_add_pd(velecsum,velec);
693 fscal = _mm_and_pd(fscal,cutoff_mask);
695 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
697 /* Update vectorial force */
698 fix3 = _mm_macc_pd(dx30,fscal,fix3);
699 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
700 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
702 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
703 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
704 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
708 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
710 /* Inner loop uses 194 flops */
713 /* End of innermost loop */
715 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
716 f+i_coord_offset,fshift+i_shift_offset);
719 /* Update potential energies */
720 gmx_mm_update_1pot_pd(velecsum,kernel_data->energygrp_elec+ggid);
721 gmx_mm_update_1pot_pd(vvdwsum,kernel_data->energygrp_vdw+ggid);
723 /* Increment number of inner iterations */
724 inneriter += j_index_end - j_index_start;
726 /* Outer loop uses 26 flops */
729 /* Increment number of outer iterations */
732 /* Update outer/inner flops */
734 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_VF,outeriter*26 + inneriter*194);
737 * Gromacs nonbonded kernel: nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_double
738 * Electrostatics interaction: Ewald
739 * VdW interaction: LennardJones
740 * Geometry: Water4-Particle
741 * Calculate force/pot: Force
744 nb_kernel_ElecEwSh_VdwLJSh_GeomW4P1_F_avx_128_fma_double
745 (t_nblist * gmx_restrict nlist,
746 rvec * gmx_restrict xx,
747 rvec * gmx_restrict ff,
748 t_forcerec * gmx_restrict fr,
749 t_mdatoms * gmx_restrict mdatoms,
750 nb_kernel_data_t * gmx_restrict kernel_data,
751 t_nrnb * gmx_restrict nrnb)
753 /* Suffixes 0,1,2,3 refer to particle indices for waters in the inner or outer loop, or
754 * just 0 for non-waters.
755 * Suffixes A,B refer to j loop unrolling done with SSE double precision, e.g. for the two different
756 * jnr indices corresponding to data put in the four positions in the SIMD register.
758 int i_shift_offset,i_coord_offset,outeriter,inneriter;
759 int j_index_start,j_index_end,jidx,nri,inr,ggid,iidx;
761 int j_coord_offsetA,j_coord_offsetB;
762 int *iinr,*jindex,*jjnr,*shiftidx,*gid;
764 real *shiftvec,*fshift,*x,*f;
765 __m128d tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
767 __m128d ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
769 __m128d ix1,iy1,iz1,fix1,fiy1,fiz1,iq1,isai1;
771 __m128d ix2,iy2,iz2,fix2,fiy2,fiz2,iq2,isai2;
773 __m128d ix3,iy3,iz3,fix3,fiy3,fiz3,iq3,isai3;
774 int vdwjidx0A,vdwjidx0B;
775 __m128d jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
776 __m128d dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
777 __m128d dx10,dy10,dz10,rsq10,rinv10,rinvsq10,r10,qq10,c6_10,c12_10;
778 __m128d dx20,dy20,dz20,rsq20,rinv20,rinvsq20,r20,qq20,c6_20,c12_20;
779 __m128d dx30,dy30,dz30,rsq30,rinv30,rinvsq30,r30,qq30,c6_30,c12_30;
780 __m128d velec,felec,velecsum,facel,crf,krf,krf2;
783 __m128d rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
786 __m128d one_sixth = _mm_set1_pd(1.0/6.0);
787 __m128d one_twelfth = _mm_set1_pd(1.0/12.0);
789 __m128d ewtabscale,eweps,twoeweps,sh_ewald,ewrt,ewtabhalfspace,ewtabF,ewtabFn,ewtabD,ewtabV;
791 __m128d dummy_mask,cutoff_mask;
792 __m128d signbit = gmx_mm_castsi128_pd( _mm_set_epi32(0x80000000,0x00000000,0x80000000,0x00000000) );
793 __m128d one = _mm_set1_pd(1.0);
794 __m128d two = _mm_set1_pd(2.0);
800 jindex = nlist->jindex;
802 shiftidx = nlist->shift;
804 shiftvec = fr->shift_vec[0];
805 fshift = fr->fshift[0];
806 facel = _mm_set1_pd(fr->epsfac);
807 charge = mdatoms->chargeA;
808 nvdwtype = fr->ntype;
810 vdwtype = mdatoms->typeA;
812 sh_ewald = _mm_set1_pd(fr->ic->sh_ewald);
813 ewtab = fr->ic->tabq_coul_F;
814 ewtabscale = _mm_set1_pd(fr->ic->tabq_scale);
815 ewtabhalfspace = _mm_set1_pd(0.5/fr->ic->tabq_scale);
817 /* Setup water-specific parameters */
818 inr = nlist->iinr[0];
819 iq1 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+1]));
820 iq2 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+2]));
821 iq3 = _mm_mul_pd(facel,_mm_set1_pd(charge[inr+3]));
822 vdwioffset0 = 2*nvdwtype*vdwtype[inr+0];
824 /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
825 rcutoff_scalar = fr->rcoulomb;
826 rcutoff = _mm_set1_pd(rcutoff_scalar);
827 rcutoff2 = _mm_mul_pd(rcutoff,rcutoff);
829 sh_vdw_invrcut6 = _mm_set1_pd(fr->ic->sh_invrc6);
830 rvdw = _mm_set1_pd(fr->rvdw);
832 /* Avoid stupid compiler warnings */
840 /* Start outer loop over neighborlists */
841 for(iidx=0; iidx<nri; iidx++)
843 /* Load shift vector for this list */
844 i_shift_offset = DIM*shiftidx[iidx];
846 /* Load limits for loop over neighbors */
847 j_index_start = jindex[iidx];
848 j_index_end = jindex[iidx+1];
850 /* Get outer coordinate index */
852 i_coord_offset = DIM*inr;
854 /* Load i particle coords and add shift vector */
855 gmx_mm_load_shift_and_4rvec_broadcast_pd(shiftvec+i_shift_offset,x+i_coord_offset,
856 &ix0,&iy0,&iz0,&ix1,&iy1,&iz1,&ix2,&iy2,&iz2,&ix3,&iy3,&iz3);
858 fix0 = _mm_setzero_pd();
859 fiy0 = _mm_setzero_pd();
860 fiz0 = _mm_setzero_pd();
861 fix1 = _mm_setzero_pd();
862 fiy1 = _mm_setzero_pd();
863 fiz1 = _mm_setzero_pd();
864 fix2 = _mm_setzero_pd();
865 fiy2 = _mm_setzero_pd();
866 fiz2 = _mm_setzero_pd();
867 fix3 = _mm_setzero_pd();
868 fiy3 = _mm_setzero_pd();
869 fiz3 = _mm_setzero_pd();
871 /* Start inner kernel loop */
872 for(jidx=j_index_start; jidx<j_index_end-1; jidx+=2)
875 /* Get j neighbor index, and coordinate index */
878 j_coord_offsetA = DIM*jnrA;
879 j_coord_offsetB = DIM*jnrB;
881 /* load j atom coordinates */
882 gmx_mm_load_1rvec_2ptr_swizzle_pd(x+j_coord_offsetA,x+j_coord_offsetB,
885 /* Calculate displacement vector */
886 dx00 = _mm_sub_pd(ix0,jx0);
887 dy00 = _mm_sub_pd(iy0,jy0);
888 dz00 = _mm_sub_pd(iz0,jz0);
889 dx10 = _mm_sub_pd(ix1,jx0);
890 dy10 = _mm_sub_pd(iy1,jy0);
891 dz10 = _mm_sub_pd(iz1,jz0);
892 dx20 = _mm_sub_pd(ix2,jx0);
893 dy20 = _mm_sub_pd(iy2,jy0);
894 dz20 = _mm_sub_pd(iz2,jz0);
895 dx30 = _mm_sub_pd(ix3,jx0);
896 dy30 = _mm_sub_pd(iy3,jy0);
897 dz30 = _mm_sub_pd(iz3,jz0);
899 /* Calculate squared distance and things based on it */
900 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
901 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
902 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
903 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
905 rinv10 = gmx_mm_invsqrt_pd(rsq10);
906 rinv20 = gmx_mm_invsqrt_pd(rsq20);
907 rinv30 = gmx_mm_invsqrt_pd(rsq30);
909 rinvsq00 = gmx_mm_inv_pd(rsq00);
910 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
911 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
912 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
914 /* Load parameters for j particles */
915 jq0 = gmx_mm_load_2real_swizzle_pd(charge+jnrA+0,charge+jnrB+0);
916 vdwjidx0A = 2*vdwtype[jnrA+0];
917 vdwjidx0B = 2*vdwtype[jnrB+0];
919 fjx0 = _mm_setzero_pd();
920 fjy0 = _mm_setzero_pd();
921 fjz0 = _mm_setzero_pd();
923 /**************************
924 * CALCULATE INTERACTIONS *
925 **************************/
927 if (gmx_mm_any_lt(rsq00,rcutoff2))
930 /* Compute parameters for interactions between i and j atoms */
931 gmx_mm_load_2pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,
932 vdwparam+vdwioffset0+vdwjidx0B,&c6_00,&c12_00);
934 /* LENNARD-JONES DISPERSION/REPULSION */
936 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
937 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
939 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
943 fscal = _mm_and_pd(fscal,cutoff_mask);
945 /* Update vectorial force */
946 fix0 = _mm_macc_pd(dx00,fscal,fix0);
947 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
948 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
950 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
951 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
952 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
956 /**************************
957 * CALCULATE INTERACTIONS *
958 **************************/
960 if (gmx_mm_any_lt(rsq10,rcutoff2))
963 r10 = _mm_mul_pd(rsq10,rinv10);
965 /* Compute parameters for interactions between i and j atoms */
966 qq10 = _mm_mul_pd(iq1,jq0);
968 /* EWALD ELECTROSTATICS */
970 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
971 ewrt = _mm_mul_pd(r10,ewtabscale);
972 ewitab = _mm_cvttpd_epi32(ewrt);
974 eweps = _mm_frcz_pd(ewrt);
976 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
978 twoeweps = _mm_add_pd(eweps,eweps);
979 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
981 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
982 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
984 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
988 fscal = _mm_and_pd(fscal,cutoff_mask);
990 /* Update vectorial force */
991 fix1 = _mm_macc_pd(dx10,fscal,fix1);
992 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
993 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
995 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
996 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
997 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1001 /**************************
1002 * CALCULATE INTERACTIONS *
1003 **************************/
1005 if (gmx_mm_any_lt(rsq20,rcutoff2))
1008 r20 = _mm_mul_pd(rsq20,rinv20);
1010 /* Compute parameters for interactions between i and j atoms */
1011 qq20 = _mm_mul_pd(iq2,jq0);
1013 /* EWALD ELECTROSTATICS */
1015 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1016 ewrt = _mm_mul_pd(r20,ewtabscale);
1017 ewitab = _mm_cvttpd_epi32(ewrt);
1019 eweps = _mm_frcz_pd(ewrt);
1021 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1023 twoeweps = _mm_add_pd(eweps,eweps);
1024 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1026 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1027 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1029 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1033 fscal = _mm_and_pd(fscal,cutoff_mask);
1035 /* Update vectorial force */
1036 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1037 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1038 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1040 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1041 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1042 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1046 /**************************
1047 * CALCULATE INTERACTIONS *
1048 **************************/
1050 if (gmx_mm_any_lt(rsq30,rcutoff2))
1053 r30 = _mm_mul_pd(rsq30,rinv30);
1055 /* Compute parameters for interactions between i and j atoms */
1056 qq30 = _mm_mul_pd(iq3,jq0);
1058 /* EWALD ELECTROSTATICS */
1060 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1061 ewrt = _mm_mul_pd(r30,ewtabscale);
1062 ewitab = _mm_cvttpd_epi32(ewrt);
1064 eweps = _mm_frcz_pd(ewrt);
1066 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1068 twoeweps = _mm_add_pd(eweps,eweps);
1069 gmx_mm_load_2pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),ewtab+_mm_extract_epi32(ewitab,1),
1071 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1072 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1074 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1078 fscal = _mm_and_pd(fscal,cutoff_mask);
1080 /* Update vectorial force */
1081 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1082 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1083 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1085 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1086 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1087 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1091 gmx_mm_decrement_1rvec_2ptr_swizzle_pd(f+j_coord_offsetA,f+j_coord_offsetB,fjx0,fjy0,fjz0);
1093 /* Inner loop uses 162 flops */
1096 if(jidx<j_index_end)
1100 j_coord_offsetA = DIM*jnrA;
1102 /* load j atom coordinates */
1103 gmx_mm_load_1rvec_1ptr_swizzle_pd(x+j_coord_offsetA,
1106 /* Calculate displacement vector */
1107 dx00 = _mm_sub_pd(ix0,jx0);
1108 dy00 = _mm_sub_pd(iy0,jy0);
1109 dz00 = _mm_sub_pd(iz0,jz0);
1110 dx10 = _mm_sub_pd(ix1,jx0);
1111 dy10 = _mm_sub_pd(iy1,jy0);
1112 dz10 = _mm_sub_pd(iz1,jz0);
1113 dx20 = _mm_sub_pd(ix2,jx0);
1114 dy20 = _mm_sub_pd(iy2,jy0);
1115 dz20 = _mm_sub_pd(iz2,jz0);
1116 dx30 = _mm_sub_pd(ix3,jx0);
1117 dy30 = _mm_sub_pd(iy3,jy0);
1118 dz30 = _mm_sub_pd(iz3,jz0);
1120 /* Calculate squared distance and things based on it */
1121 rsq00 = gmx_mm_calc_rsq_pd(dx00,dy00,dz00);
1122 rsq10 = gmx_mm_calc_rsq_pd(dx10,dy10,dz10);
1123 rsq20 = gmx_mm_calc_rsq_pd(dx20,dy20,dz20);
1124 rsq30 = gmx_mm_calc_rsq_pd(dx30,dy30,dz30);
1126 rinv10 = gmx_mm_invsqrt_pd(rsq10);
1127 rinv20 = gmx_mm_invsqrt_pd(rsq20);
1128 rinv30 = gmx_mm_invsqrt_pd(rsq30);
1130 rinvsq00 = gmx_mm_inv_pd(rsq00);
1131 rinvsq10 = _mm_mul_pd(rinv10,rinv10);
1132 rinvsq20 = _mm_mul_pd(rinv20,rinv20);
1133 rinvsq30 = _mm_mul_pd(rinv30,rinv30);
1135 /* Load parameters for j particles */
1136 jq0 = _mm_load_sd(charge+jnrA+0);
1137 vdwjidx0A = 2*vdwtype[jnrA+0];
1139 fjx0 = _mm_setzero_pd();
1140 fjy0 = _mm_setzero_pd();
1141 fjz0 = _mm_setzero_pd();
1143 /**************************
1144 * CALCULATE INTERACTIONS *
1145 **************************/
1147 if (gmx_mm_any_lt(rsq00,rcutoff2))
1150 /* Compute parameters for interactions between i and j atoms */
1151 gmx_mm_load_1pair_swizzle_pd(vdwparam+vdwioffset0+vdwjidx0A,&c6_00,&c12_00);
1153 /* LENNARD-JONES DISPERSION/REPULSION */
1155 rinvsix = _mm_mul_pd(_mm_mul_pd(rinvsq00,rinvsq00),rinvsq00);
1156 fvdw = _mm_mul_pd(_mm_msub_pd(c12_00,rinvsix,c6_00),_mm_mul_pd(rinvsix,rinvsq00));
1158 cutoff_mask = _mm_cmplt_pd(rsq00,rcutoff2);
1162 fscal = _mm_and_pd(fscal,cutoff_mask);
1164 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1166 /* Update vectorial force */
1167 fix0 = _mm_macc_pd(dx00,fscal,fix0);
1168 fiy0 = _mm_macc_pd(dy00,fscal,fiy0);
1169 fiz0 = _mm_macc_pd(dz00,fscal,fiz0);
1171 fjx0 = _mm_macc_pd(dx00,fscal,fjx0);
1172 fjy0 = _mm_macc_pd(dy00,fscal,fjy0);
1173 fjz0 = _mm_macc_pd(dz00,fscal,fjz0);
1177 /**************************
1178 * CALCULATE INTERACTIONS *
1179 **************************/
1181 if (gmx_mm_any_lt(rsq10,rcutoff2))
1184 r10 = _mm_mul_pd(rsq10,rinv10);
1186 /* Compute parameters for interactions between i and j atoms */
1187 qq10 = _mm_mul_pd(iq1,jq0);
1189 /* EWALD ELECTROSTATICS */
1191 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1192 ewrt = _mm_mul_pd(r10,ewtabscale);
1193 ewitab = _mm_cvttpd_epi32(ewrt);
1195 eweps = _mm_frcz_pd(ewrt);
1197 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1199 twoeweps = _mm_add_pd(eweps,eweps);
1200 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1201 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1202 felec = _mm_mul_pd(_mm_mul_pd(qq10,rinv10),_mm_sub_pd(rinvsq10,felec));
1204 cutoff_mask = _mm_cmplt_pd(rsq10,rcutoff2);
1208 fscal = _mm_and_pd(fscal,cutoff_mask);
1210 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1212 /* Update vectorial force */
1213 fix1 = _mm_macc_pd(dx10,fscal,fix1);
1214 fiy1 = _mm_macc_pd(dy10,fscal,fiy1);
1215 fiz1 = _mm_macc_pd(dz10,fscal,fiz1);
1217 fjx0 = _mm_macc_pd(dx10,fscal,fjx0);
1218 fjy0 = _mm_macc_pd(dy10,fscal,fjy0);
1219 fjz0 = _mm_macc_pd(dz10,fscal,fjz0);
1223 /**************************
1224 * CALCULATE INTERACTIONS *
1225 **************************/
1227 if (gmx_mm_any_lt(rsq20,rcutoff2))
1230 r20 = _mm_mul_pd(rsq20,rinv20);
1232 /* Compute parameters for interactions between i and j atoms */
1233 qq20 = _mm_mul_pd(iq2,jq0);
1235 /* EWALD ELECTROSTATICS */
1237 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1238 ewrt = _mm_mul_pd(r20,ewtabscale);
1239 ewitab = _mm_cvttpd_epi32(ewrt);
1241 eweps = _mm_frcz_pd(ewrt);
1243 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1245 twoeweps = _mm_add_pd(eweps,eweps);
1246 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1247 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1248 felec = _mm_mul_pd(_mm_mul_pd(qq20,rinv20),_mm_sub_pd(rinvsq20,felec));
1250 cutoff_mask = _mm_cmplt_pd(rsq20,rcutoff2);
1254 fscal = _mm_and_pd(fscal,cutoff_mask);
1256 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1258 /* Update vectorial force */
1259 fix2 = _mm_macc_pd(dx20,fscal,fix2);
1260 fiy2 = _mm_macc_pd(dy20,fscal,fiy2);
1261 fiz2 = _mm_macc_pd(dz20,fscal,fiz2);
1263 fjx0 = _mm_macc_pd(dx20,fscal,fjx0);
1264 fjy0 = _mm_macc_pd(dy20,fscal,fjy0);
1265 fjz0 = _mm_macc_pd(dz20,fscal,fjz0);
1269 /**************************
1270 * CALCULATE INTERACTIONS *
1271 **************************/
1273 if (gmx_mm_any_lt(rsq30,rcutoff2))
1276 r30 = _mm_mul_pd(rsq30,rinv30);
1278 /* Compute parameters for interactions between i and j atoms */
1279 qq30 = _mm_mul_pd(iq3,jq0);
1281 /* EWALD ELECTROSTATICS */
1283 /* Calculate Ewald table index by multiplying r with scale and truncate to integer */
1284 ewrt = _mm_mul_pd(r30,ewtabscale);
1285 ewitab = _mm_cvttpd_epi32(ewrt);
1287 eweps = _mm_frcz_pd(ewrt);
1289 eweps = _mm_sub_pd(ewrt,_mm_round_pd(ewrt, _MM_FROUND_FLOOR));
1291 twoeweps = _mm_add_pd(eweps,eweps);
1292 gmx_mm_load_1pair_swizzle_pd(ewtab+_mm_extract_epi32(ewitab,0),&ewtabF,&ewtabFn);
1293 felec = _mm_macc_pd(eweps,ewtabFn,_mm_mul_pd( _mm_sub_pd(one,eweps),ewtabF));
1294 felec = _mm_mul_pd(_mm_mul_pd(qq30,rinv30),_mm_sub_pd(rinvsq30,felec));
1296 cutoff_mask = _mm_cmplt_pd(rsq30,rcutoff2);
1300 fscal = _mm_and_pd(fscal,cutoff_mask);
1302 fscal = _mm_unpacklo_pd(fscal,_mm_setzero_pd());
1304 /* Update vectorial force */
1305 fix3 = _mm_macc_pd(dx30,fscal,fix3);
1306 fiy3 = _mm_macc_pd(dy30,fscal,fiy3);
1307 fiz3 = _mm_macc_pd(dz30,fscal,fiz3);
1309 fjx0 = _mm_macc_pd(dx30,fscal,fjx0);
1310 fjy0 = _mm_macc_pd(dy30,fscal,fjy0);
1311 fjz0 = _mm_macc_pd(dz30,fscal,fjz0);
1315 gmx_mm_decrement_1rvec_1ptr_swizzle_pd(f+j_coord_offsetA,fjx0,fjy0,fjz0);
1317 /* Inner loop uses 162 flops */
1320 /* End of innermost loop */
1322 gmx_mm_update_iforce_4atom_swizzle_pd(fix0,fiy0,fiz0,fix1,fiy1,fiz1,fix2,fiy2,fiz2,fix3,fiy3,fiz3,
1323 f+i_coord_offset,fshift+i_shift_offset);
1325 /* Increment number of inner iterations */
1326 inneriter += j_index_end - j_index_start;
1328 /* Outer loop uses 24 flops */
1331 /* Increment number of outer iterations */
1334 /* Update outer/inner flops */
1336 inc_nrnb(nrnb,eNR_NBKERNEL_ELEC_VDW_W4_F,outeriter*24 + inneriter*162);