From: Roland Schulz Date: Thu, 21 Aug 2014 16:16:09 +0000 (-0400) Subject: Minor performance improments X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?p=alexxy%2Fgromacs.git;a=commitdiff_plain;h=4cd608ba1c6b677472b3dfcb19cc5f15ebe9e639 Minor performance improments Mostly useful as lesson learned. 1) The double precision constant forces the compiler to convert the single precision variable to double, then do the multiplication in double and then convert back. Using the single precsion constant in double reduces the accuracy (the calculation is still done double but the constant has only single precision). 2) Using a temporary array instead of a temporary scalar causes ICC14 to generate an extra store. Change-Id: Ib320ac2ae4ff80ce48277544abff468c483cc83a --- diff --git a/src/gromacs/gmxlib/nonbonded/nb_free_energy.c b/src/gromacs/gmxlib/nonbonded/nb_free_energy.c index 93f1d1c361..6b5d6d89dc 100644 --- a/src/gromacs/gmxlib/nonbonded/nb_free_energy.c +++ b/src/gromacs/gmxlib/nonbonded/nb_free_energy.c @@ -127,10 +127,19 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, int ewitab; real ewrt, eweps, ewtabscale, ewtabhalfspace, sh_ewald; + const real onetwelfth = 1.0/12.0; + const real onesixth = 1.0/6.0; + const real zero = 0.0; + const real half = 0.5; + const real one = 1.0; + const real two = 2.0; + const real six = 6.0; + const real fourtyeight = 48.0; + sh_ewald = fr->ic->sh_ewald; ewtab = fr->ic->tabq_coul_FDV0; ewtabscale = fr->ic->tabq_scale; - ewtabhalfspace = 0.5/ewtabscale; + ewtabhalfspace = half/ewtabscale; tab_ewald_F_lj = fr->ic->tabq_vdw_F; tab_ewald_V_lj = fr->ic->tabq_vdw_V; @@ -295,8 +304,8 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, dvdl_vdw = 0; /* Lambda factor for state A, 1-lambda*/ - LFC[STATE_A] = 1.0 - lambda_coul; - LFV[STATE_A] = 1.0 - lambda_vdw; + LFC[STATE_A] = one - lambda_coul; + LFV[STATE_A] = one - lambda_vdw; /* Lambda factor for state B, lambda*/ LFC[STATE_B] = lambda_coul; @@ -391,12 +400,12 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, r = 0; } - if (sc_r_power == 6.0) + if (sc_r_power == six) { rpm2 = rsq*rsq; /* r4 */ rp = rpm2*rsq; /* r6 */ } - else if (sc_r_power == 48.0) + else if (sc_r_power == fourtyeight) { rp = rsq*rsq*rsq; /* r6 */ rp = rp*rp; /* r12 */ @@ -429,7 +438,7 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, if ((c6[i] > 0) && (c12[i] > 0)) { /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */ - sigma6[i] = 0.5*c12[i]/c6[i]; + sigma6[i] = half*c12[i]/c6[i]; sigma2[i] = pow(sigma6[i], 1.0/3.0); /* should be able to get rid of this ^^^ internal pow call eventually. Will require agreement on what data to store externally. Can't be fixed without larger scale changes, so not 4.6 */ @@ -444,12 +453,12 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, sigma6[i] = sigma6_def; sigma2[i] = sigma2_def; } - if (sc_r_power == 6.0) + if (sc_r_power == six) { sigma_pow[i] = sigma6[i]; sigma_powm2[i] = sigma6[i]/sigma2[i]; } - else if (sc_r_power == 48.0) + else if (sc_r_power == fourtyeight) { sigma_pow[i] = sigma6[i]*sigma6[i]; /* sigma^12 */ sigma_pow[i] = sigma_pow[i]*sigma_pow[i]; /* sigma^24 */ @@ -486,13 +495,13 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, if ( (qq[i] != 0) || (c6[i] != 0) || (c12[i] != 0) ) { /* this section has to be inside the loop because of the dependence on sigma_pow */ - rpinvC = 1.0/(alpha_coul_eff*lfac_coul[i]*sigma_pow[i]+rp); - rinvC = pow(rpinvC, 1.0/sc_r_power); - rC = 1.0/rinvC; + rpinvC = one/(alpha_coul_eff*lfac_coul[i]*sigma_pow[i]+rp); + rinvC = pow(rpinvC, one/sc_r_power); + rC = one/rinvC; - rpinvV = 1.0/(alpha_vdw_eff*lfac_vdw[i]*sigma_pow[i]+rp); - rinvV = pow(rpinvV, 1.0/sc_r_power); - rV = 1.0/rinvV; + rpinvV = one/(alpha_vdw_eff*lfac_vdw[i]*sigma_pow[i]+rp); + rinvV = pow(rpinvV, one/sc_r_power); + rV = one/rinvV; if (do_tab) { @@ -534,7 +543,7 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, case GMX_NBKERNEL_ELEC_REACTIONFIELD: /* reaction-field */ Vcoul[i] = qq[i]*(rinvC + krf*rC*rC-crf); - FscalC[i] = qq[i]*(rinvC - 2.0*krf*rC*rC); + FscalC[i] = qq[i]*(rinvC - two*krf*rC*rC); break; case GMX_NBKERNEL_ELEC_CUBICSPLINETABLE: @@ -546,7 +555,7 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, Heps2 = eps2C*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+epsC*Fp; - FF = Fp+Geps+2.0*Heps2; + FF = Fp+Geps+two*Heps2; Vcoul[i] = qq[i]*VV; FscalC[i] = -qq[i]*tabscale*FF*rC; break; @@ -576,8 +585,8 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, break; case GMX_NBKERNEL_ELEC_NONE: - FscalC[i] = 0.0; - Vcoul[i] = 0.0; + FscalC[i] = zero; + Vcoul[i] = zero; break; default: @@ -588,16 +597,16 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, if (fr->coulomb_modifier == eintmodPOTSWITCH) { d = rC-fr->rcoulomb_switch; - d = (d > 0.0) ? d : 0.0; + d = (d > zero) ? d : zero; d2 = d*d; - sw = 1.0+d2*d*(elec_swV3+d*(elec_swV4+d*elec_swV5)); + sw = one+d2*d*(elec_swV3+d*(elec_swV4+d*elec_swV5)); dsw = d2*(elec_swF2+d*(elec_swF3+d*elec_swF4)); FscalC[i] = FscalC[i]*sw - rC*Vcoul[i]*dsw; Vcoul[i] *= sw; - FscalC[i] = (rC < rcoulomb) ? FscalC[i] : 0.0; - Vcoul[i] = (rC < rcoulomb) ? Vcoul[i] : 0.0; + FscalC[i] = (rC < rcoulomb) ? FscalC[i] : zero; + Vcoul[i] = (rC < rcoulomb) ? Vcoul[i] : zero; } } @@ -615,7 +624,7 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, { case GMX_NBKERNEL_VDW_LENNARDJONES: /* cutoff LJ */ - if (sc_r_power == 6.0) + if (sc_r_power == six) { rinv6 = rpinvV; } @@ -627,8 +636,8 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, Vvdw6 = c6[i]*rinv6; Vvdw12 = c12[i]*rinv6*rinv6; - Vvdw[i] = ( (Vvdw12 - c12[i]*sh_invrc6*sh_invrc6)*(1.0/12.0) - - (Vvdw6 - c6[i]*sh_invrc6)*(1.0/6.0)); + Vvdw[i] = ( (Vvdw12 - c12[i]*sh_invrc6*sh_invrc6)*onetwelfth + - (Vvdw6 - c6[i]*sh_invrc6)*onesixth); FscalV[i] = Vvdw12 - Vvdw6; break; @@ -646,7 +655,7 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, Heps2 = eps2V*VFtab[nnn+3]; Fp = F+Geps+Heps2; VV = Y+epsV*Fp; - FF = Fp+Geps+2.0*Heps2; + FF = Fp+Geps+two*Heps2; Vvdw[i] += c6[i]*VV; FscalV[i] -= c6[i]*tabscale*FF*rV; @@ -657,13 +666,13 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, Heps2 = eps2V*VFtab[nnn+7]; Fp = F+Geps+Heps2; VV = Y+epsV*Fp; - FF = Fp+Geps+2.0*Heps2; + FF = Fp+Geps+two*Heps2; Vvdw[i] += c12[i]*VV; FscalV[i] -= c12[i]*tabscale*FF*rV; break; case GMX_NBKERNEL_VDW_LJEWALD: - if (sc_r_power == 6.0) + if (sc_r_power == six) { rinv6 = rpinvV; } @@ -680,8 +689,8 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, Vvdw6 = c6[i]*rinv6; Vvdw12 = c12[i]*rinv6*rinv6; - Vvdw[i] = ( (Vvdw12 - c12[i]*sh_invrc6*sh_invrc6)*(1.0/12.0) - - (Vvdw6 - c6[i]*sh_invrc6 - c6grid*sh_lj_ewald)*(1.0/6.0)); + Vvdw[i] = ( (Vvdw12 - c12[i]*sh_invrc6*sh_invrc6)*onetwelfth + - (Vvdw6 - c6[i]*sh_invrc6 - c6grid*sh_lj_ewald)*onesixth); FscalV[i] = Vvdw12 - Vvdw6; } else @@ -689,17 +698,17 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, /* Normal LJ-PME */ ewcljrsq = ewclj2*rV*rV; exponent = exp(-ewcljrsq); - poly = exponent*(1.0 + ewcljrsq + ewcljrsq*ewcljrsq*0.5); - vvdw_disp = (c6[i]-c6grid*(1.0-poly))*rinv6; + poly = exponent*(one + ewcljrsq + ewcljrsq*ewcljrsq*half); + vvdw_disp = (c6[i]-c6grid*(one-poly))*rinv6; vvdw_rep = c12[i]*rinv6*rinv6; - FscalV[i] = vvdw_rep - vvdw_disp - c6grid*(1.0/6.0)*exponent*ewclj6; - Vvdw[i] = (vvdw_rep - c12[i]*sh_invrc6*sh_invrc6)/12.0 - (vvdw_disp - c6[i]*sh_invrc6 - c6grid*sh_lj_ewald)/6.0; + FscalV[i] = vvdw_rep - vvdw_disp - c6grid*onesixth*exponent*ewclj6; + Vvdw[i] = (vvdw_rep - c12[i]*sh_invrc6*sh_invrc6)*onetwelfth - (vvdw_disp - c6[i]*sh_invrc6 - c6grid*sh_lj_ewald)/six; } break; case GMX_NBKERNEL_VDW_NONE: - Vvdw[i] = 0.0; - FscalV[i] = 0.0; + Vvdw[i] = zero; + FscalV[i] = zero; break; default: @@ -710,16 +719,16 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, if (fr->vdw_modifier == eintmodPOTSWITCH) { d = rV-fr->rvdw_switch; - d = (d > 0.0) ? d : 0.0; + d = (d > zero) ? d : zero; d2 = d*d; - sw = 1.0+d2*d*(vdw_swV3+d*(vdw_swV4+d*vdw_swV5)); + sw = one+d2*d*(vdw_swV3+d*(vdw_swV4+d*vdw_swV5)); dsw = d2*(vdw_swF2+d*(vdw_swF3+d*vdw_swF4)); FscalV[i] = FscalV[i]*sw - rV*Vvdw[i]*dsw; Vvdw[i] *= sw; - FscalV[i] = (rV < rvdw) ? FscalV[i] : 0.0; - Vvdw[i] = (rV < rvdw) ? Vvdw[i] : 0.0; + FscalV[i] = (rV < rvdw) ? FscalV[i] : zero; + Vvdw[i] = (rV < rvdw) ? Vvdw[i] : zero; } } @@ -754,11 +763,11 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, * As there is no singularity, there is no need for soft-core. */ VV = krf*rsq - crf; - FF = -2.0*krf; + FF = -two*krf; if (ii == jnr) { - VV *= 0.5; + VV *= half; } for (i = 0; i < NSTATES; i++) @@ -800,7 +809,7 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, * scheme, and corresponds to a self-interaction that will * occur twice. Scale it down by 50% to only include it once. */ - v_lr *= 0.5; + v_lr *= half; } for (i = 0; i < NSTATES; i++) @@ -835,8 +844,8 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, /* TODO: Currently the Ewald LJ table does not contain * the factor 1/6, we should add this. */ - FF = f_lr*rinv/6.0; - VV = (tab_ewald_V_lj[ri] - ewtabhalfspace*frac*(tab_ewald_F_lj[ri] + f_lr))/6.0; + FF = f_lr*rinv/six; + VV = (tab_ewald_V_lj[ri] - ewtabhalfspace*frac*(tab_ewald_F_lj[ri] + f_lr))/six; if (ii == jnr) { @@ -845,7 +854,7 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, * scheme, and corresponds to a self-interaction that will * occur twice. Scale it down by 50% to only include it once. */ - VV *= 0.5; + VV *= half; } for (i = 0; i < NSTATES; i++) @@ -944,6 +953,12 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a real velec[2], vvdw[2]; int i, ntab; + const real half = 0.5; + const real one = 1.0; + const real two = 2.0; + const real six = 6.0; + const real fourtyeight = 48.0; + qq[0] = qqA; qq[1] = qqB; c6[0] = c6A; @@ -951,12 +966,12 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a c12[0] = c12A; c12[1] = c12B; - if (sc_r_power == 6.0) + if (sc_r_power == six) { rpm2 = r2*r2; /* r4 */ rp = rpm2*r2; /* r6 */ } - else if (sc_r_power == 48.0) + else if (sc_r_power == fourtyeight) { rp = r2*r2*r2; /* r6 */ rp = rp*rp; /* r12 */ @@ -966,7 +981,7 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a } else { - rp = pow(r2, 0.5*sc_r_power); /* not currently supported as input, but can handle it */ + rp = pow(r2, half*sc_r_power); /* not currently supported as input, but can handle it */ rpm2 = rp/r2; } @@ -978,8 +993,8 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a /* The c6 & c12 coefficients now contain the constants 6.0 and 12.0, respectively. * Correct for this by multiplying with (1/12.0)/(1/6.0)=6.0/12.0=0.5. */ - sigma6[i] = 0.5*c12[i]/c6[i]; - sigma2[i] = pow(0.5*c12[i]/c6[i], 1.0/3.0); + sigma6[i] = half*c12[i]/c6[i]; + sigma2[i] = pow(half*c12[i]/c6[i], 1.0/3.0); /* should be able to get rid of this ^^^ internal pow call eventually. Will require agreement on what data to store externally. Can't be fixed without larger scale changes, so not 5.0 */ if (sigma6[i] < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */ @@ -993,12 +1008,12 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a sigma6[i] = sigma6_def; sigma2[i] = sigma2_def; } - if (sc_r_power == 6.0) + if (sc_r_power == six) { sigma_pow[i] = sigma6[i]; sigma_powm2[i] = sigma6[i]/sigma2[i]; } - else if (sc_r_power == 48.0) + else if (sc_r_power == fourtyeight) { sigma_pow[i] = sigma6[i]*sigma6[i]; /* sigma^12 */ sigma_pow[i] = sigma_pow[i]*sigma_pow[i]; /* sigma^24 */ @@ -1036,8 +1051,8 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a if ( (qq[i] != 0) || (c6[i] != 0) || (c12[i] != 0) ) { /* Coulomb */ - rpinv = 1.0/(alpha_coul_eff*lfac_coul[i]*sigma_pow[i]+rp); - r_coul = pow(rpinv, -1.0/sc_r_power); + rpinv = one/(alpha_coul_eff*lfac_coul[i]*sigma_pow[i]+rp); + r_coul = pow(rpinv, -one/sc_r_power); /* Electrostatics table lookup data */ rtab = r_coul*tabscale; @@ -1052,13 +1067,13 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a Heps2 = eps2*vftab[ntab+3]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; - FF = Fp+Geps+2.0*Heps2; + FF = Fp+Geps+two*Heps2; velec[i] = qq[i]*VV; fscal_elec[i] = -qq[i]*FF*r_coul*rpinv*tabscale; /* Vdw */ - rpinv = 1.0/(alpha_vdw_eff*lfac_vdw[i]*sigma_pow[i]+rp); - r_vdw = pow(rpinv, -1.0/sc_r_power); + rpinv = one/(alpha_vdw_eff*lfac_vdw[i]*sigma_pow[i]+rp); + r_vdw = pow(rpinv, -one/sc_r_power); /* Vdw table lookup data */ rtab = r_vdw*tabscale; ntab = rtab; @@ -1072,7 +1087,7 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a Heps2 = eps2*vftab[ntab+7]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; - FF = Fp+Geps+2.0*Heps2; + FF = Fp+Geps+two*Heps2; vvdw[i] = c6[i]*VV; fscal_vdw[i] = -c6[i]*FF; @@ -1083,7 +1098,7 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a Heps2 = eps2*vftab[ntab+11]; Fp = F+Geps+Heps2; VV = Y+eps*Fp; - FF = Fp+Geps+2.0*Heps2; + FF = Fp+Geps+two*Heps2; vvdw[i] += c12[i]*VV; fscal_vdw[i] -= c12[i]*FF; fscal_vdw[i] *= r_vdw*rpinv*tabscale; diff --git a/src/gromacs/mdlib/domdec_setup.cpp b/src/gromacs/mdlib/domdec_setup.cpp index a802f29fb9..0660a849cd 100644 --- a/src/gromacs/mdlib/domdec_setup.cpp +++ b/src/gromacs/mdlib/domdec_setup.cpp @@ -249,13 +249,13 @@ static int div_up(int n, int f) real comm_box_frac(ivec dd_nc, real cutoff, gmx_ddbox_t *ddbox) { int i, j, k; - rvec bt, nw; + rvec nw; real comm_vol; for (i = 0; i < DIM; i++) { - bt[i] = ddbox->box_size[i]*ddbox->skew_fac[i]; - nw[i] = dd_nc[i]*cutoff/bt[i]; + real bt = ddbox->box_size[i]*ddbox->skew_fac[i]; + nw[i] = dd_nc[i]*cutoff/bt; } comm_vol = 0;