From: Berk Hess Date: Tue, 26 Nov 2013 07:52:16 +0000 (+0100) Subject: Free-energy now works with the Verlet scheme X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=99aa704d61062b0ef46cf00a7655ab759651e2ea;p=alexxy%2Fgromacs.git Free-energy now works with the Verlet scheme Implemented perturbed non-bonded interactions with the Verlet by masking them from the nbnxn pair lists and storing them in an old-fashioned group neighbor list. This way we keep a single free energy kernel. The free energy kernel uses OpenMP atomic for reductions. Also fixed grompp setting nstcalcenergy=0 with nstenergy=0. Change-Id: I5a6a7e84b46e06250d141e2e08cb3a11077cddab --- diff --git a/src/gromacs/gmxlib/nonbonded/nb_free_energy.c b/src/gromacs/gmxlib/nonbonded/nb_free_energy.c index 491b128c34..cb6f5c26a8 100644 --- a/src/gromacs/gmxlib/nonbonded/nb_free_energy.c +++ b/src/gromacs/gmxlib/nonbonded/nb_free_energy.c @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013, by the GROMACS development team, led by + * Copyright (c) 2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -45,6 +45,7 @@ #include "nonbonded.h" #include "nb_kernel.h" #include "nrnb.h" +#include "macros.h" #include "nb_free_energy.h" void @@ -94,8 +95,8 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, const real * shiftvec; real dvdl_part; real * fshift; - real tabscale; - const real * VFtab; + real tabscale = 0; + const real * VFtab = NULL; const real * x; real * f; real facel, krf, crf; @@ -107,9 +108,11 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, real * dvdl; real * Vv; real * Vc; - gmx_bool bDoForces; - real rcoulomb, rvdw, sh_invrc6; - gmx_bool bExactElecCutoff, bExactVdwCutoff; + gmx_bool bDoForces, bDoShiftForces, bDoPotential; + real rcoulomb, sh_ewald; + real rvdw, sh_invrc6; + gmx_bool bExactElecCutoff, bExactVdwCutoff, bExactCutoffAll, bEwald; + real rcutoff_max2; real rcutoff, rcutoff2, rswitch, d, d2, swV3, swV4, swV5, swF2, swF3, swF4, sw, dsw, rinvcorr; const real * tab_ewald_F; const real * tab_ewald_V; @@ -142,8 +145,6 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, ntype = fr->ntype; nbfp = fr->nbfp; Vv = kernel_data->energygrp_vdw; - tabscale = kernel_data->table_elec_vdw->scale; - VFtab = kernel_data->table_elec_vdw->data; lambda_coul = kernel_data->lambda[efptCOUL]; lambda_vdw = kernel_data->lambda[efptVDW]; dvdl = kernel_data->dvdl; @@ -154,8 +155,11 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, sigma6_def = fr->sc_sigma6_def; sigma6_min = fr->sc_sigma6_min; bDoForces = kernel_data->flags & GMX_NONBONDED_DO_FORCE; + bDoShiftForces = kernel_data->flags & GMX_NONBONDED_DO_SHIFTFORCE; + bDoPotential = kernel_data->flags & GMX_NONBONDED_DO_POTENTIAL; rcoulomb = fr->rcoulomb; + sh_ewald = fr->ic->sh_ewald; rvdw = fr->rvdw; sh_invrc6 = fr->ic->sh_invrc6; @@ -190,8 +194,41 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, swF4 = 0.0; } - bExactElecCutoff = (fr->coulomb_modifier != eintmodNONE) || fr->eeltype == eelRF_ZERO; - bExactVdwCutoff = (fr->vdw_modifier 
!= eintmodNONE); + if (fr->cutoff_scheme == ecutsVERLET) + { + const interaction_const_t *ic; + + ic = fr->ic; + + ivdw = GMX_NBKERNEL_VDW_LENNARDJONES; + + if (ic->eeltype == eelCUT || EEL_RF(ic->eeltype)) + { + icoul = GMX_NBKERNEL_ELEC_REACTIONFIELD; + } + else if (EEL_PME_EWALD(ic->eeltype)) + { + icoul = GMX_NBKERNEL_ELEC_EWALD; + } + else + { + gmx_incons("Unsupported eeltype with Verlet and free-energy"); + } + + bExactElecCutoff = TRUE; + bExactVdwCutoff = TRUE; + } + else + { + bExactElecCutoff = (fr->coulomb_modifier != eintmodNONE) || fr->eeltype == eelRF_ZERO; + bExactVdwCutoff = (fr->vdw_modifier != eintmodNONE); + } + + bExactCutoffAll = (bExactElecCutoff && bExactVdwCutoff); + rcutoff_max2 = max(fr->rcoulomb, fr->rvdw); + rcutoff_max2 = rcutoff_max2*rcutoff_max2; + + bEwald = (icoul == GMX_NBKERNEL_ELEC_EWALD); /* fix compiler warnings */ nj1 = 0; @@ -229,12 +266,20 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, do_tab = (icoul == GMX_NBKERNEL_ELEC_CUBICSPLINETABLE || ivdw == GMX_NBKERNEL_VDW_CUBICSPLINETABLE); - - /* we always use the combined table here */ - tab_elemsize = 12; + if (do_tab) + { + tabscale = kernel_data->table_elec_vdw->scale; + VFtab = kernel_data->table_elec_vdw->data; + /* we always use the combined table here */ + tab_elemsize = 12; + } for (n = 0; (n < nri); n++) { + int npair_within_cutoff; + + npair_within_cutoff = 0; + is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; @@ -263,9 +308,34 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, dx = ix - x[j3]; dy = iy - x[j3+1]; dz = iz - x[j3+2]; - rsq = dx*dx+dy*dy+dz*dz; - rinv = gmx_invsqrt(rsq); - r = rsq*rinv; + rsq = dx*dx + dy*dy + dz*dz; + + if (bExactCutoffAll && rsq >= rcutoff_max2) + { + /* We save significant time by skipping all code below. + * Note that with soft-core interactions, the actual cut-off + * check might be different. But since the soft-core distance + * is always larger than r, checking on r here is safe. 
+ */ + continue; + } + npair_within_cutoff++; + + if (rsq > 0) + { + rinv = gmx_invsqrt(rsq); + r = rsq*rinv; + } + else + { + /* The force at r=0 is zero, because of symmetry. + * But note that the potential is in general non-zero, + * since the soft-cored r will be non-zero. + */ + rinv = 0; + r = 0; + } + if (sc_r_power == 6.0) { rpm2 = rsq*rsq; /* r4 */ @@ -285,261 +355,304 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, rpm2 = rp/rsq; } - tj[STATE_A] = ntiA+2*typeA[jnr]; - tj[STATE_B] = ntiB+2*typeB[jnr]; + Fscal = 0; + qq[STATE_A] = iqA*chargeA[jnr]; qq[STATE_B] = iqB*chargeB[jnr]; - for (i = 0; i < NSTATES; i++) + if (nlist->excl_fep == NULL || nlist->excl_fep[k]) { + tj[STATE_A] = ntiA+2*typeA[jnr]; + tj[STATE_B] = ntiB+2*typeB[jnr]; - c6[i] = nbfp[tj[i]]; - c12[i] = nbfp[tj[i]+1]; - if ((c6[i] > 0) && (c12[i] > 0)) + for (i = 0; i < NSTATES; i++) { - /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */ - sigma6[i] = 0.5*c12[i]/c6[i]; - sigma2[i] = pow(sigma6[i], 1.0/3.0); - /* should be able to get rid of this ^^^ internal pow call eventually. Will require agreement on - what data to store externally. Can't be fixed without larger scale changes, so not 4.6 */ - if (sigma6[i] < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */ + + c6[i] = nbfp[tj[i]]; + c12[i] = nbfp[tj[i]+1]; + if ((c6[i] > 0) && (c12[i] > 0)) + { + /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */ + sigma6[i] = 0.5*c12[i]/c6[i]; + sigma2[i] = pow(sigma6[i], 1.0/3.0); + /* should be able to get rid of this ^^^ internal pow call eventually. Will require agreement on + what data to store externally. 
Can't be fixed without larger scale changes, so not 4.6 */ + if (sigma6[i] < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */ + { + sigma6[i] = sigma6_min; + sigma2[i] = sigma2_min; + } + } + else { - sigma6[i] = sigma6_min; - sigma2[i] = sigma2_min; + sigma6[i] = sigma6_def; + sigma2[i] = sigma2_def; + } + if (sc_r_power == 6.0) + { + sigma_pow[i] = sigma6[i]; + sigma_powm2[i] = sigma6[i]/sigma2[i]; + } + else if (sc_r_power == 48.0) + { + sigma_pow[i] = sigma6[i]*sigma6[i]; /* sigma^12 */ + sigma_pow[i] = sigma_pow[i]*sigma_pow[i]; /* sigma^24 */ + sigma_pow[i] = sigma_pow[i]*sigma_pow[i]; /* sigma^48 */ + sigma_powm2[i] = sigma_pow[i]/sigma2[i]; + } + else + { /* not really supported as input, but in here for testing the general case*/ + sigma_pow[i] = pow(sigma2[i], sc_r_power/2); + sigma_powm2[i] = sigma_pow[i]/(sigma2[i]); } } - else - { - sigma6[i] = sigma6_def; - sigma2[i] = sigma2_def; - } - if (sc_r_power == 6.0) - { - sigma_pow[i] = sigma6[i]; - sigma_powm2[i] = sigma6[i]/sigma2[i]; - } - else if (sc_r_power == 48.0) + + /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/ + if ((c12[STATE_A] > 0) && (c12[STATE_B] > 0)) { - sigma_pow[i] = sigma6[i]*sigma6[i]; /* sigma^12 */ - sigma_pow[i] = sigma_pow[i]*sigma_pow[i]; /* sigma^24 */ - sigma_pow[i] = sigma_pow[i]*sigma_pow[i]; /* sigma^48 */ - sigma_powm2[i] = sigma_pow[i]/sigma2[i]; + alpha_vdw_eff = 0; + alpha_coul_eff = 0; } else - { /* not really supported as input, but in here for testing the general case*/ - sigma_pow[i] = pow(sigma2[i], sc_r_power/2); - sigma_powm2[i] = sigma_pow[i]/(sigma2[i]); + { + alpha_vdw_eff = alpha_vdw; + alpha_coul_eff = alpha_coul; } - } - /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/ - if ((c12[STATE_A] > 0) && (c12[STATE_B] > 0)) - { - alpha_vdw_eff = 0; - alpha_coul_eff = 0; - } - else - { - alpha_vdw_eff = alpha_vdw; - alpha_coul_eff 
= alpha_coul; - } - - for (i = 0; i < NSTATES; i++) - { - FscalC[i] = 0; - FscalV[i] = 0; - Vcoul[i] = 0; - Vvdw[i] = 0; - - /* Only spend time on A or B state if it is non-zero */ - if ( (qq[i] != 0) || (c6[i] != 0) || (c12[i] != 0) ) + for (i = 0; i < NSTATES; i++) { + FscalC[i] = 0; + FscalV[i] = 0; + Vcoul[i] = 0; + Vvdw[i] = 0; - /* this section has to be inside the loop becaue of the dependence on sigma_pow */ - rpinvC = 1.0/(alpha_coul_eff*lfac_coul[i]*sigma_pow[i]+rp); - rinvC = pow(rpinvC, 1.0/sc_r_power); - rC = 1.0/rinvC; - - rpinvV = 1.0/(alpha_vdw_eff*lfac_vdw[i]*sigma_pow[i]+rp); - rinvV = pow(rpinvV, 1.0/sc_r_power); - rV = 1.0/rinvV; - - if (do_tab) + /* Only spend time on A or B state if it is non-zero */ + if ( (qq[i] != 0) || (c6[i] != 0) || (c12[i] != 0) ) { - rtC = rC*tabscale; - n0 = rtC; - epsC = rtC-n0; - eps2C = epsC*epsC; - n1C = tab_elemsize*n0; - - rtV = rV*tabscale; - n0 = rtV; - epsV = rtV-n0; - eps2V = epsV*epsV; - n1V = tab_elemsize*n0; - } + /* this section has to be inside the loop because of the dependence on sigma_pow */ + rpinvC = 1.0/(alpha_coul_eff*lfac_coul[i]*sigma_pow[i]+rp); + rinvC = pow(rpinvC, 1.0/sc_r_power); + rC = 1.0/rinvC; - /* With Ewald and soft-core we should put the cut-off on r, - * not on the soft-cored rC, as the real-space and - * reciprocal space contributions should (almost) cancel. 
- */ - if (qq[i] != 0 && - !(bExactElecCutoff && - ((icoul != GMX_NBKERNEL_ELEC_EWALD && rC >= rcoulomb) || - (icoul == GMX_NBKERNEL_ELEC_EWALD && r >= rcoulomb)))) - { - switch (icoul) + rpinvV = 1.0/(alpha_vdw_eff*lfac_vdw[i]*sigma_pow[i]+rp); + rinvV = pow(rpinvV, 1.0/sc_r_power); + rV = 1.0/rinvV; + + if (do_tab) { - case GMX_NBKERNEL_ELEC_COULOMB: - case GMX_NBKERNEL_ELEC_EWALD: - /* simple cutoff (yes, ewald is done all on direct space for free energy) */ - Vcoul[i] = qq[i]*rinvC; - FscalC[i] = Vcoul[i]; - break; - - case GMX_NBKERNEL_ELEC_REACTIONFIELD: - /* reaction-field */ - Vcoul[i] = qq[i]*(rinvC + krf*rC*rC-crf); - FscalC[i] = qq[i]*(rinvC - 2.0*krf*rC*rC); - break; - - case GMX_NBKERNEL_ELEC_CUBICSPLINETABLE: - /* non-Ewald tabulated coulomb */ - nnn = n1C; - Y = VFtab[nnn]; - F = VFtab[nnn+1]; - Geps = epsC*VFtab[nnn+2]; - Heps2 = eps2C*VFtab[nnn+3]; - Fp = F+Geps+Heps2; - VV = Y+epsC*Fp; - FF = Fp+Geps+2.0*Heps2; - Vcoul[i] = qq[i]*VV; - FscalC[i] = -qq[i]*tabscale*FF*rC; - break; - - case GMX_NBKERNEL_ELEC_GENERALIZEDBORN: - gmx_fatal(FARGS, "Free energy and GB not implemented.\n"); - break; - - case GMX_NBKERNEL_ELEC_NONE: - FscalC[i] = 0.0; - Vcoul[i] = 0.0; - break; - - default: - gmx_incons("Invalid icoul in free energy kernel"); - break; + rtC = rC*tabscale; + n0 = rtC; + epsC = rtC-n0; + eps2C = epsC*epsC; + n1C = tab_elemsize*n0; + + rtV = rV*tabscale; + n0 = rtV; + epsV = rtV-n0; + eps2V = epsV*epsV; + n1V = tab_elemsize*n0; } - if (fr->coulomb_modifier == eintmodPOTSWITCH) + /* With Ewald and soft-core we should put the cut-off on r, + * not on the soft-cored rC, as the real-space and + * reciprocal space contributions should (almost) cancel. + */ + if (qq[i] != 0 && + !(bExactElecCutoff && + ((!bEwald && rC >= rcoulomb) || + (bEwald && r >= rcoulomb)))) { - d = rC-rswitch; - d = (d > 0.0) ? 
d : 0.0; - d2 = d*d; - sw = 1.0+d2*d*(swV3+d*(swV4+d*swV5)); - dsw = d2*(swF2+d*(swF3+d*swF4)); - - Vcoul[i] *= sw; - FscalC[i] = FscalC[i]*sw + Vcoul[i]*dsw; + switch (icoul) + { + case GMX_NBKERNEL_ELEC_COULOMB: + /* simple cutoff */ + Vcoul[i] = qq[i]*rinvC; + FscalC[i] = Vcoul[i]; + break; + + case GMX_NBKERNEL_ELEC_EWALD: + /* Ewald FEP is done only on the 1/r part */ + Vcoul[i] = qq[i]*(rinvC - sh_ewald); + FscalC[i] = Vcoul[i]; + break; + + case GMX_NBKERNEL_ELEC_REACTIONFIELD: + /* reaction-field */ + Vcoul[i] = qq[i]*(rinvC + krf*rC*rC-crf); + FscalC[i] = qq[i]*(rinvC - 2.0*krf*rC*rC); + break; + + case GMX_NBKERNEL_ELEC_CUBICSPLINETABLE: + /* non-Ewald tabulated coulomb */ + nnn = n1C; + Y = VFtab[nnn]; + F = VFtab[nnn+1]; + Geps = epsC*VFtab[nnn+2]; + Heps2 = eps2C*VFtab[nnn+3]; + Fp = F+Geps+Heps2; + VV = Y+epsC*Fp; + FF = Fp+Geps+2.0*Heps2; + Vcoul[i] = qq[i]*VV; + FscalC[i] = -qq[i]*tabscale*FF*rC; + break; + + case GMX_NBKERNEL_ELEC_GENERALIZEDBORN: + gmx_fatal(FARGS, "Free energy and GB not implemented.\n"); + break; + + case GMX_NBKERNEL_ELEC_NONE: + FscalC[i] = 0.0; + Vcoul[i] = 0.0; + break; + + default: + gmx_incons("Invalid icoul in free energy kernel"); + break; + } + + if (fr->coulomb_modifier == eintmodPOTSWITCH) + { + d = rC-rswitch; + d = (d > 0.0) ? 
d : 0.0; + d2 = d*d; + sw = 1.0+d2*d*(swV3+d*(swV4+d*swV5)); + dsw = d2*(swF2+d*(swF3+d*swF4)); + + Vcoul[i] *= sw; + FscalC[i] = FscalC[i]*sw + Vcoul[i]*dsw; + } } - } - if ((c6[i] != 0 || c12[i] != 0) && - !(bExactVdwCutoff && rV >= rvdw)) - { - switch (ivdw) + if ((c6[i] != 0 || c12[i] != 0) && + !(bExactVdwCutoff && rV >= rvdw)) { - case GMX_NBKERNEL_VDW_LENNARDJONES: - /* cutoff LJ */ - if (sc_r_power == 6.0) - { - rinv6 = rpinvV; - } - else - { - rinv6 = pow(rinvV, 6.0); - } - Vvdw6 = c6[i]*rinv6; - Vvdw12 = c12[i]*rinv6*rinv6; - if (fr->vdw_modifier == eintmodPOTSHIFT) - { - Vvdw[i] = ( (Vvdw12-c12[i]*sh_invrc6*sh_invrc6)*(1.0/12.0) - -(Vvdw6-c6[i]*sh_invrc6)*(1.0/6.0)); - } - else - { - Vvdw[i] = Vvdw12*(1.0/12.0) - Vvdw6*(1.0/6.0); - } - FscalV[i] = Vvdw12 - Vvdw6; - break; - - case GMX_NBKERNEL_VDW_BUCKINGHAM: - gmx_fatal(FARGS, "Buckingham free energy not supported."); - break; - - case GMX_NBKERNEL_VDW_CUBICSPLINETABLE: - /* Table LJ */ - nnn = n1V+4; - /* dispersion */ - Y = VFtab[nnn]; - F = VFtab[nnn+1]; - Geps = epsV*VFtab[nnn+2]; - Heps2 = eps2V*VFtab[nnn+3]; - Fp = F+Geps+Heps2; - VV = Y+epsV*Fp; - FF = Fp+Geps+2.0*Heps2; - Vvdw[i] += c6[i]*VV; - FscalV[i] -= c6[i]*tabscale*FF*rV; - - /* repulsion */ - Y = VFtab[nnn+4]; - F = VFtab[nnn+5]; - Geps = epsV*VFtab[nnn+6]; - Heps2 = eps2V*VFtab[nnn+7]; - Fp = F+Geps+Heps2; - VV = Y+epsV*Fp; - FF = Fp+Geps+2.0*Heps2; - Vvdw[i] += c12[i]*VV; - FscalV[i] -= c12[i]*tabscale*FF*rV; - break; - - case GMX_NBKERNEL_VDW_NONE: - Vvdw[i] = 0.0; - FscalV[i] = 0.0; - break; - - default: - gmx_incons("Invalid ivdw in free energy kernel"); - break; + switch (ivdw) + { + case GMX_NBKERNEL_VDW_LENNARDJONES: + /* cutoff LJ */ + if (sc_r_power == 6.0) + { + rinv6 = rpinvV; + } + else + { + rinv6 = pow(rinvV, 6.0); + } + Vvdw6 = c6[i]*rinv6; + Vvdw12 = c12[i]*rinv6*rinv6; + if (fr->vdw_modifier == eintmodPOTSHIFT) + { + Vvdw[i] = ( (Vvdw12-c12[i]*sh_invrc6*sh_invrc6)*(1.0/12.0) + -(Vvdw6-c6[i]*sh_invrc6)*(1.0/6.0)); + } + 
else + { + Vvdw[i] = Vvdw12*(1.0/12.0) - Vvdw6*(1.0/6.0); + } + FscalV[i] = Vvdw12 - Vvdw6; + break; + + case GMX_NBKERNEL_VDW_BUCKINGHAM: + gmx_fatal(FARGS, "Buckingham free energy not supported."); + break; + + case GMX_NBKERNEL_VDW_CUBICSPLINETABLE: + /* Table LJ */ + nnn = n1V+4; + /* dispersion */ + Y = VFtab[nnn]; + F = VFtab[nnn+1]; + Geps = epsV*VFtab[nnn+2]; + Heps2 = eps2V*VFtab[nnn+3]; + Fp = F+Geps+Heps2; + VV = Y+epsV*Fp; + FF = Fp+Geps+2.0*Heps2; + Vvdw[i] += c6[i]*VV; + FscalV[i] -= c6[i]*tabscale*FF*rV; + + /* repulsion */ + Y = VFtab[nnn+4]; + F = VFtab[nnn+5]; + Geps = epsV*VFtab[nnn+6]; + Heps2 = eps2V*VFtab[nnn+7]; + Fp = F+Geps+Heps2; + VV = Y+epsV*Fp; + FF = Fp+Geps+2.0*Heps2; + Vvdw[i] += c12[i]*VV; + FscalV[i] -= c12[i]*tabscale*FF*rV; + break; + + case GMX_NBKERNEL_VDW_NONE: + Vvdw[i] = 0.0; + FscalV[i] = 0.0; + break; + + default: + gmx_incons("Invalid ivdw in free energy kernel"); + break; + } + + if (fr->vdw_modifier == eintmodPOTSWITCH) + { + d = rV-rswitch; + d = (d > 0.0) ? d : 0.0; + d2 = d*d; + sw = 1.0+d2*d*(swV3+d*(swV4+d*swV5)); + dsw = d2*(swF2+d*(swF3+d*swF4)); + + Vvdw[i] *= sw; + FscalV[i] = FscalV[i]*sw + Vvdw[i]*dsw; + + FscalV[i] = (rV < rvdw) ? FscalV[i] : 0.0; + Vvdw[i] = (rV < rvdw) ? Vvdw[i] : 0.0; + } } - if (fr->vdw_modifier == eintmodPOTSWITCH) - { - d = rV-rswitch; - d = (d > 0.0) ? d : 0.0; - d2 = d*d; - sw = 1.0+d2*d*(swV3+d*(swV4+d*swV5)); - dsw = d2*(swF2+d*(swF3+d*swF4)); + /* FscalC (and FscalV) now contain: dV/drC * rC + * Now we multiply by rC^-p, so it will be: dV/drC * rC^1-p + * Further down we first multiply by r^p-2 and then by + * the vector r, which in total gives: dV/drC * (r/rC)^1-p + */ + FscalC[i] *= rpinvC; + FscalV[i] *= rpinvV; + } + } - Vvdw[i] *= sw; - FscalV[i] = FscalV[i]*sw + Vvdw[i]*dsw; + /* Assemble A and B states */ + for (i = 0; i < NSTATES; i++) + { + vctot += LFC[i]*Vcoul[i]; + vvtot += LFV[i]*Vvdw[i]; - FscalV[i] = (rV < rvdw) ? FscalV[i] : 0.0; - Vvdw[i] = (rV < rvdw) ? 
Vvdw[i] : 0.0; - } - } + Fscal += LFC[i]*FscalC[i]*rpm2; + Fscal += LFV[i]*FscalV[i]*rpm2; - /* FscalC (and FscalV) now contain: dV/drC * rC - * Now we multiply by rC^-p, so it will be: dV/drC * rC^1-p - * Further down we first multiply by r^p-2 and then by - * the vector r, which in total gives: dV/drC * (r/rC)^1-p - */ - FscalC[i] *= rpinvC; - FscalV[i] *= rpinvV; + dvdl_coul += Vcoul[i]*DLF[i] + LFC[i]*alpha_coul_eff*dlfac_coul[i]*FscalC[i]*sigma_pow[i]; + dvdl_vdw += Vvdw[i]*DLF[i] + LFV[i]*alpha_vdw_eff*dlfac_vdw[i]*FscalV[i]*sigma_pow[i]; } } + else if (icoul == GMX_NBKERNEL_ELEC_REACTIONFIELD) + { + /* For excluded pairs, which are only in this pair list when + * using the Verlet scheme, we don't use soft-core. + * The group scheme also doesn't soft-core for these. + * As there is no singularity, there is no need for soft-core. + */ + VV = krf*rsq - crf; + FF = -2.0*krf; - Fscal = 0; + if (ii == jnr) + { + VV *= 0.5; + } + + for (i = 0; i < NSTATES; i++) + { + vctot += LFC[i]*qq[i]*VV; + Fscal += LFC[i]*qq[i]*FF; + dvdl_coul += DLF[i]*qq[i]*VV; + } + } if (icoul == GMX_NBKERNEL_ELEC_EWALD && !(bExactElecCutoff && r >= rcoulomb)) @@ -559,6 +672,11 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, FF = f_lr*rinv; VV = tab_ewald_V[ri] - tab_ewald_halfsp*frac*(tab_ewald_F[ri] + f_lr); + if (ii == jnr) + { + VV *= 0.5; + } + for (i = 0; i < NSTATES; i++) { vctot -= LFC[i]*qq[i]*VV; @@ -567,19 +685,6 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict nlist, } } - /* Assemble A and B states */ - for (i = 0; i < NSTATES; i++) - { - vctot += LFC[i]*Vcoul[i]; - vvtot += LFV[i]*Vvdw[i]; - - Fscal += LFC[i]*FscalC[i]*rpm2; - Fscal += LFV[i]*FscalV[i]*rpm2; - - dvdl_coul += Vcoul[i]*DLF[i] + LFC[i]*alpha_coul_eff*dlfac_coul[i]*FscalC[i]*sigma_pow[i]; - dvdl_vdw += Vvdw[i]*DLF[i] + LFV[i]*alpha_vdw_eff*dlfac_vdw[i]*FscalV[i]*sigma_pow[i]; - } - if (bDoForces) { tx = Fscal*dx; @@ -588,27 +693,58 @@ gmx_nb_free_energy_kernel(const t_nblist * 
gmx_restrict nlist, fix = fix + tx; fiy = fiy + ty; fiz = fiz + tz; - f[j3] = f[j3] - tx; - f[j3+1] = f[j3+1] - ty; - f[j3+2] = f[j3+2] - tz; + /* OpenMP atomics are expensive, but this kernels is also + * expensive, so we can take this hit, instead of using + * thread-local output buffers and extra reduction. + */ +#pragma omp atomic + f[j3] -= tx; +#pragma omp atomic + f[j3+1] -= ty; +#pragma omp atomic + f[j3+2] -= tz; } } - if (bDoForces) + /* The atomics below are expensive with many OpenMP threads. + * Here unperturbed i-particles will usually only have a few + * (perturbed) j-particles in the list. Thus with a buffered list + * we can skip a significant number of i-reductions with a check. + */ + if (npair_within_cutoff > 0) { - f[ii3] = f[ii3] + fix; - f[ii3+1] = f[ii3+1] + fiy; - f[ii3+2] = f[ii3+2] + fiz; - fshift[is3] = fshift[is3] + fix; - fshift[is3+1] = fshift[is3+1] + fiy; - fshift[is3+2] = fshift[is3+2] + fiz; + if (bDoForces) + { +#pragma omp atomic + f[ii3] += fix; +#pragma omp atomic + f[ii3+1] += fiy; +#pragma omp atomic + f[ii3+2] += fiz; + } + if (bDoShiftForces) + { +#pragma omp atomic + fshift[is3] += fix; +#pragma omp atomic + fshift[is3+1] += fiy; +#pragma omp atomic + fshift[is3+2] += fiz; + } + if (bDoPotential) + { + ggid = gid[n]; +#pragma omp atomic + Vc[ggid] += vctot; +#pragma omp atomic + Vv[ggid] += vvtot; + } } - ggid = gid[n]; - Vc[ggid] = Vc[ggid] + vctot; - Vv[ggid] = Vv[ggid] + vvtot; } +#pragma omp atomic dvdl[efptCOUL] += dvdl_coul; + #pragma omp atomic dvdl[efptVDW] += dvdl_vdw; /* Estimate flops, average for free energy stuff: @@ -672,7 +808,7 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a sigma6[i] = 0.5*c12[i]/c6[i]; sigma2[i] = pow(0.5*c12[i]/c6[i], 1.0/3.0); /* should be able to get rid of this ^^^ internal pow call eventually. Will require agreement on - what data to store externally. Can't be fixed without larger scale changes, so not 4.6 */ + what data to store externally. 
Can't be fixed without larger scale changes, so not 5.0 */ if (sigma6[i] < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */ { sigma6[i] = sigma6_min; diff --git a/src/gromacs/gmxpreprocess/readir.c b/src/gromacs/gmxpreprocess/readir.c index b63b5fc843..472c517b04 100644 --- a/src/gromacs/gmxpreprocess/readir.c +++ b/src/gromacs/gmxpreprocess/readir.c @@ -498,7 +498,7 @@ void check_ir(const char *mdparin, t_inputrec *ir, t_gromppopts *opts, /* find the smallest of ( nstenergy, nstdhdl ) */ if (ir->efep != efepNO && ir->fepvals->nstdhdl > 0 && - (ir->fepvals->nstdhdl < ir->nstenergy) ) + (ir->nstenergy == 0 || ir->fepvals->nstdhdl < ir->nstenergy)) { min_nst = ir->fepvals->nstdhdl; min_name = nstdh; diff --git a/src/gromacs/legacyheaders/nonbonded.h b/src/gromacs/legacyheaders/nonbonded.h index d92cfd0064..6176d60631 100644 --- a/src/gromacs/legacyheaders/nonbonded.h +++ b/src/gromacs/legacyheaders/nonbonded.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013, by the GROMACS development team, led by + * Copyright (c) 2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -69,9 +69,10 @@ gmx_nonbonded_set_kernel_pointers(FILE * fplog, #define GMX_NONBONDED_DO_LR (1<<0) #define GMX_NONBONDED_DO_FORCE (1<<1) -#define GMX_NONBONDED_DO_FOREIGNLAMBDA (1<<2) -#define GMX_NONBONDED_DO_POTENTIAL (1<<3) -#define GMX_NONBONDED_DO_SR (1<<4) +#define GMX_NONBONDED_DO_SHIFTFORCE (1<<2) +#define GMX_NONBONDED_DO_FOREIGNLAMBDA (1<<3) +#define GMX_NONBONDED_DO_POTENTIAL (1<<4) +#define GMX_NONBONDED_DO_SR (1<<5) void do_nonbonded(t_forcerec *fr, diff --git a/src/gromacs/legacyheaders/ns.h b/src/gromacs/legacyheaders/ns.h index 7c65628aa3..dafbfa9f6e 100644 --- a/src/gromacs/legacyheaders/ns.h +++ b/src/gromacs/legacyheaders/ns.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013, by the GROMACS development team, led by + * Copyright (c) 2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -100,6 +100,9 @@ int natoms_beyond_ns_buffer(t_inputrec *ir, t_forcerec *fr, t_block *cgs, matrix scale_tot, rvec *x); /* Returns the number of atoms that moved beyond the ns buffer */ +void reallocate_nblist(t_nblist *nl); +/* List reallocation, only exported for Verlet scheme use with FEP */ + #ifdef __cplusplus } #endif diff --git a/src/gromacs/legacyheaders/types/enums.h b/src/gromacs/legacyheaders/types/enums.h index 5133ea4b73..01e4da7b28 100644 --- a/src/gromacs/legacyheaders/types/enums.h +++ b/src/gromacs/legacyheaders/types/enums.h @@ -112,8 +112,9 @@ enum { #define EEL_RF(e) ((e) == eelRF || (e) == eelGRF || (e) == eelRF_NEC || (e) == eelRF_ZERO ) #define EEL_PME(e) ((e) == eelPME || (e) == eelPMESWITCH || (e) == eelPMEUSER || (e) == eelPMEUSERSWITCH || (e) == eelP3M_AD) -#define EEL_EWALD(e) (EEL_PME(e) || (e) == eelEWALD) -#define EEL_FULL(e) (EEL_PME(e) || (e) == eelPOISSON || (e) == eelEWALD) +#define EEL_PME_EWALD(e) (EEL_PME(e) || (e) == eelEWALD) +#define EEL_FULL(e) (EEL_PME_EWALD(e) || (e) == eelPOISSON) + #define EEL_USER(e) ((e) == eelUSER || (e) == eelPMEUSER || (e) == (eelPMEUSERSWITCH)) enum { diff --git a/src/gromacs/legacyheaders/types/forcerec.h b/src/gromacs/legacyheaders/types/forcerec.h index e1cf22ff95..aa0a976ae4 100644 --- a/src/gromacs/legacyheaders/types/forcerec.h +++ b/src/gromacs/legacyheaders/types/forcerec.h @@ -90,20 +90,26 @@ typedef struct t_nblist nlist_lr[eNL_NR]; } t_nblists; -/* macros for the cginfo data in forcerec */ -/* The maximum cg size in cginfo is 63 +/* macros for the cginfo data in forcerec + * + * Since the tpx format support max 256 energy groups, we do the same here. + * Note that we thus have bits 8-14 still unused. + * + * The maximum cg size in cginfo is 63 * because we only have space for 6 bits in cginfo, * this cg size entry is actually only read with domain decomposition. * But there is a smaller limit due to the t_excl data structure * which is defined in nblist.h. 
*/ -#define SET_CGINFO_GID(cgi, gid) (cgi) = (((cgi) & ~65535) | (gid) ) -#define GET_CGINFO_GID(cgi) ( (cgi) & 65535) +#define SET_CGINFO_GID(cgi, gid) (cgi) = (((cgi) & ~255) | (gid)) +#define GET_CGINFO_GID(cgi) ( (cgi) & 255) +#define SET_CGINFO_FEP(cgi) (cgi) = ((cgi) | (1<<15)) +#define GET_CGINFO_FEP(cgi) ( (cgi) & (1<<15)) #define SET_CGINFO_EXCL_INTRA(cgi) (cgi) = ((cgi) | (1<<16)) #define GET_CGINFO_EXCL_INTRA(cgi) ( (cgi) & (1<<16)) #define SET_CGINFO_EXCL_INTER(cgi) (cgi) = ((cgi) | (1<<17)) #define GET_CGINFO_EXCL_INTER(cgi) ( (cgi) & (1<<17)) -#define SET_CGINFO_SOLOPT(cgi, opt) (cgi) = (((cgi) & ~(3<<18)) | ((opt)<<18)) +#define SET_CGINFO_SOLOPT(cgi, opt) (cgi) = (((cgi) & ~(3<<18)) | ((opt)<<18)) #define GET_CGINFO_SOLOPT(cgi) (((cgi)>>18) & 3) #define SET_CGINFO_CONSTR(cgi) (cgi) = ((cgi) | (1<<20)) #define GET_CGINFO_CONSTR(cgi) ( (cgi) & (1<<20)) @@ -116,7 +122,7 @@ typedef struct #define GET_CGINFO_HAS_VDW(cgi) ( (cgi) & (1<<23)) #define SET_CGINFO_HAS_Q(cgi) (cgi) = ((cgi) | (1<<24)) #define GET_CGINFO_HAS_Q(cgi) ( (cgi) & (1<<24)) -#define SET_CGINFO_NATOMS(cgi, opt) (cgi) = (((cgi) & ~(63<<25)) | ((opt)<<25)) +#define SET_CGINFO_NATOMS(cgi, opt) (cgi) = (((cgi) & ~(63<<25)) | ((opt)<<25)) #define GET_CGINFO_NATOMS(cgi) (((cgi)>>25) & 63) diff --git a/src/gromacs/legacyheaders/types/nblist.h b/src/gromacs/legacyheaders/types/nblist.h index 540ab77a16..df4b290ba1 100644 --- a/src/gromacs/legacyheaders/types/nblist.h +++ b/src/gromacs/legacyheaders/types/nblist.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2012, by the GROMACS development team, led by + * Copyright (c) 2012,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -66,7 +66,6 @@ typedef struct int nri, maxnri; /* Current/max number of i particles */ int nrj, maxnrj; /* Current/max number of j particles */ - int maxlen; /* maxnr of j atoms for a single i atom */ int * iinr; /* The i-elements */ int * iinr_end; /* The end atom, only with enlistCG */ int * gid; /* Index in energy arrays */ @@ -74,6 +73,7 @@ typedef struct int * jindex; /* Index in jjnr */ int * jjnr; /* The j-atom list */ int * jjnr_end; /* The end atom, only with enltypeCG */ + char * excl_fep; /* Exclusions for FEP with Verlet scheme */ t_excl * excl; /* Exclusions, only with enltypeCG */ /* We use separate pointers for kernels that compute both potential diff --git a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h b/src/gromacs/legacyheaders/types/nbnxn_pairlist.h index 2de1dde4ea..0748804c37 100644 --- a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h +++ b/src/gromacs/legacyheaders/types/nbnxn_pairlist.h @@ -37,8 +37,9 @@ #define _nbnxn_pairlist_h #ifdef HAVE_CONFIG_H -# include +#include #endif +#include "nblist.h" #include "../thread_mpi/atomic.h" @@ -77,11 +78,11 @@ typedef void nbnxn_free_t (void *ptr); * is found, all subsequent j-entries in the i-entry also have full masks. */ typedef struct { - int cj; /* The j-cluster */ - unsigned excl; /* The exclusion (interaction) bits */ + int cj; /* The j-cluster */ + unsigned int excl; /* The exclusion (interaction) bits */ #ifdef GMX_SIMD_IBM_QPX /* Indices into the arrays of SIMD interaction masks. 
*/ - char interaction_mask_indices[4]; + char interaction_mask_indices[4]; #endif } nbnxn_cj_t; @@ -115,8 +116,8 @@ typedef struct { } nbnxn_sci_t; typedef struct { - unsigned imask; /* The i-cluster interactions mask for 1 warp */ - int excl_ind; /* Index into the exclusion array for 1 warp */ + unsigned int imask; /* The i-cluster interactions mask for 1 warp */ + int excl_ind; /* Index into the exclusion array for 1 warp */ } nbnxn_im_ei_t; typedef struct { @@ -125,7 +126,7 @@ typedef struct { } nbnxn_cj4_t; typedef struct { - unsigned pair[32]; /* Topology exclusion interaction bits for one warp, + unsigned int pair[32]; /* Topology exclusion interaction bits for one warp, * each unsigned has bitS for 4*8 i clusters */ } nbnxn_excl_t; @@ -176,6 +177,7 @@ typedef struct { int natpair_ljq; /* Total number of atom pairs for LJ+Q kernel */ int natpair_lj; /* Total number of atom pairs for LJ kernel */ int natpair_q; /* Total number of atom pairs for Q kernel */ + t_nblist **nbl_fep; } nbnxn_pairlist_set_t; enum { @@ -217,9 +219,9 @@ typedef struct { /* Flags for telling if threads write to force output buffers */ typedef struct { - int nflag; /* The number of flag blocks */ - unsigned *flag; /* Bit i is set when thread i writes to a cell-block */ - int flag_nalloc; /* Allocation size of cxy_flag */ + int nflag; /* The number of flag blocks */ + unsigned int *flag; /* Bit i is set when thread i writes to a cell-block */ + int flag_nalloc; /* Allocation size of cxy_flag */ } nbnxn_buffer_flags_t; /* LJ combination rules: geometric, Lorentz-Berthelot, none */ @@ -262,8 +264,8 @@ typedef struct { /* Filters for topology exclusion masks for the SIMD kernels. * filter2 is the same as filter1, but with each element duplicated. 
*/ - unsigned *simd_exclusion_filter1; - unsigned *simd_exclusion_filter2; + unsigned int *simd_exclusion_filter1; + unsigned int *simd_exclusion_filter2; #ifdef GMX_SIMD_IBM_QPX real *simd_interaction_array; /* Array of masks needed for exclusions on QPX */ #endif diff --git a/src/gromacs/mdlib/force.c b/src/gromacs/mdlib/force.c index 93a9d06936..3001236712 100644 --- a/src/gromacs/mdlib/force.c +++ b/src/gromacs/mdlib/force.c @@ -261,10 +261,15 @@ void do_force_lowlevel(FILE *fplog, gmx_int64_t step, /* Add short-range interactions */ donb_flags |= GMX_NONBONDED_DO_SR; + /* Currently all group scheme kernels always calculate (shift-)forces */ if (flags & GMX_FORCE_FORCES) { donb_flags |= GMX_NONBONDED_DO_FORCE; } + if (flags & GMX_FORCE_VIRIAL) + { + donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE; + } if (flags & GMX_FORCE_ENERGY) { donb_flags |= GMX_NONBONDED_DO_POTENTIAL; @@ -469,7 +474,7 @@ void do_force_lowlevel(FILE *fplog, gmx_int64_t step, real dvdl_long_range_q = 0, dvdl_long_range_lj = 0; int status = 0; - if (EEL_EWALD(fr->eeltype) || EVDW_PME(fr->vdwtype)) + if (EEL_PME_EWALD(fr->eeltype) || EVDW_PME(fr->vdwtype)) { real dvdl_long_range_correction_q = 0; real dvdl_long_range_correction_lj = 0; @@ -561,7 +566,7 @@ void do_force_lowlevel(FILE *fplog, gmx_int64_t step, wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION); } - if (EEL_EWALD(fr->eeltype) && fr->n_tpi == 0) + if (EEL_PME_EWALD(fr->eeltype) && fr->n_tpi == 0) { Vcorr_q += ewald_charge_correction(cr, fr, lambda[efptCOUL], box, &dvdl_long_range_correction_q, @@ -657,7 +662,7 @@ void do_force_lowlevel(FILE *fplog, gmx_int64_t step, } } - if (!EEL_PME(fr->eeltype) && EEL_EWALD(fr->eeltype)) + if (!EEL_PME(fr->eeltype) && EEL_PME_EWALD(fr->eeltype)) { Vlr_q = do_ewald(ir, x, fr->f_novirsum, md->chargeA, md->chargeB, diff --git a/src/gromacs/mdlib/forcerec.c b/src/gromacs/mdlib/forcerec.c index 8d18cb7a24..954c6a714a 100644 --- a/src/gromacs/mdlib/forcerec.c +++ b/src/gromacs/mdlib/forcerec.c @@ -591,6 
+591,7 @@ enum { static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop, t_forcerec *fr, gmx_bool bNoSolvOpt, + gmx_bool *bFEP_NonBonded, gmx_bool *bExcl_IntraCGAll_InterCGNone) { const t_block *cgs; @@ -605,7 +606,7 @@ static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop, int *a_con; int ftype; int ia; - gmx_bool bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ; + gmx_bool bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ, bFEP; ncg_tot = ncg_mtop(mtop); snew(cginfo_mb, mtop->nmolblock); @@ -623,6 +624,7 @@ static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop, } } + *bFEP_NonBonded = FALSE; *bExcl_IntraCGAll_InterCGNone = TRUE; excl_nalloc = 10; @@ -722,6 +724,7 @@ static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop, bExclInter = FALSE; bHaveVDW = FALSE; bHaveQ = FALSE; + bFEP = FALSE; for (ai = a0; ai < a1; ai++) { /* Check VDW and electrostatic interactions */ @@ -730,6 +733,8 @@ static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop, bHaveQ = bHaveQ || (molt->atoms.atom[ai].q != 0 || molt->atoms.atom[ai].qB != 0); + bFEP = bFEP || (PERTURBED(molt->atoms.atom[ai]) != 0); + /* Clear the exclusion list for atom ai */ for (aj = a0; aj < a1; aj++) { @@ -790,6 +795,11 @@ static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop, { SET_CGINFO_HAS_Q(cginfo[cgm+cg]); } + if (bFEP) + { + SET_CGINFO_FEP(cginfo[cgm+cg]); + *bFEP_NonBonded = TRUE; + } /* Store the charge group size */ SET_CGINFO_NATOMS(cginfo[cgm+cg], a1-a0); @@ -1592,7 +1602,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir, *kernel_type = nbnxnk4xN_SIMD_4xN; #ifndef GMX_SIMD_HAVE_FMA - if (EEL_PME(ir->coulombtype) || EEL_EWALD(ir->coulombtype) || + if (EEL_PME_EWALD(ir->coulombtype) || EVDW_PME(ir->vdwtype)) { /* We have Ewald kernels without FMA (Intel Sandy/Ivy Bridge). 
@@ -2080,6 +2090,7 @@ init_interaction_const(FILE *fp, static void init_nb_verlet(FILE *fp, nonbonded_verlet_t **nb_verlet, + gmx_bool bFEP_NonBonded, const t_inputrec *ir, const t_forcerec *fr, const t_commrec *cr, @@ -2185,6 +2196,7 @@ static void init_nb_verlet(FILE *fp, nbnxn_init_search(&nbv->nbs, DOMAINDECOMP(cr) ? &cr->dd->nc : NULL, DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL, + bFEP_NonBonded, gmx_omp_nthreads_get(emntNonbonded)); for (i = 0; i < nbv->ngrp; i++) @@ -2278,6 +2290,7 @@ void init_forcerec(FILE *fp, const t_block *cgs; gmx_bool bGenericKernelOnly; gmx_bool bMakeTables, bMakeSeparate14Table, bSomeNormalNbListsAreInUse; + gmx_bool bFEP_NonBonded; t_nblists *nbl; int *nm_ind, egp_flags; @@ -3098,6 +3111,7 @@ void init_forcerec(FILE *fp, /* Set all the static charge group info */ fr->cginfo_mb = init_cginfo_mb(fp, mtop, fr, bNoSolvOpt, + &bFEP_NonBonded, &fr->bExcl_IntraCGAll_InterCGNone); if (DOMAINDECOMP(cr)) { @@ -3148,7 +3162,7 @@ void init_forcerec(FILE *fp, gmx_fatal(FARGS, "With Verlet lists rcoulomb and rvdw should be identical"); } - init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt); + init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt); } /* fr->ic is used both by verlet and group kernels (to some extent) now */ diff --git a/src/gromacs/mdlib/genborn.c b/src/gromacs/mdlib/genborn.c index 7c33afa186..ff228f9529 100644 --- a/src/gromacs/mdlib/genborn.c +++ b/src/gromacs/mdlib/genborn.c @@ -107,7 +107,6 @@ static int init_gb_nblist(int natoms, t_nblist *nl) { nl->maxnri = natoms*4; nl->maxnrj = 0; - nl->maxlen = 0; nl->nri = 0; nl->nrj = 0; nl->iinr = NULL; diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c index 2d9e10559d..ba0b9a5bbc 100644 --- a/src/gromacs/mdlib/nbnxn_atomdata.c +++ b/src/gromacs/mdlib/nbnxn_atomdata.c @@ -941,6 +941,67 @@ static void nbnxn_atomdata_set_charges(nbnxn_atomdata_t *nbat, } } +/* Set the charges of perturbed atoms in nbnxn_atomdata_t to 0. 
+ * This is to automatically remove the RF/PME self term in the nbnxn kernels. + * Part of the zero interactions are still calculated in the normal kernels. + * All perturbed interactions are calculated in the free energy kernel, + * using the original charge and LJ data, not nbnxn_atomdata_t. + */ +static void nbnxn_atomdata_mask_fep(nbnxn_atomdata_t *nbat, + int ngrid, + const nbnxn_search_t nbs) +{ + real *q; + int stride_q, g, nsubc, c_offset, c, subc, i, ind; + const nbnxn_grid_t *grid; + + if (nbat->XFormat == nbatXYZQ) + { + q = nbat->x + ZZ + 1; + stride_q = STRIDE_XYZQ; + } + else + { + q = nbat->q; + stride_q = 1; + } + + for (g = 0; g < ngrid; g++) + { + grid = &nbs->grid[g]; + if (grid->bSimple) + { + nsubc = 1; + } + else + { + nsubc = GPU_NSUBCELL; + } + + c_offset = grid->cell0*grid->na_sc; + + /* Loop over all columns and copy and fill */ + for (c = 0; c < grid->nc*nsubc; c++) + { + /* Does this cluster contain perturbed particles? */ + if (grid->fep[c] != 0) + { + for (i = 0; i < grid->na_c; i++) + { + /* Is this a perturbed particle? 
*/ + if (grid->fep[c] & (1 << i)) + { + ind = c_offset + c*grid->na_c + i; + /* Set atom type and charge to non-interacting */ + nbat->type[ind] = nbat->ntype - 1; + q[ind*stride_q] = 0; + } + } + } + } + } +} + /* Copies the energy group indices to a reordered and packed array */ static void copy_egp_to_nbat_egps(const int *a, int na, int na_round, int na_c, int bit_shift, @@ -980,6 +1041,11 @@ static void nbnxn_atomdata_set_energygroups(nbnxn_atomdata_t *nbat, int g, i, ncz, ash; const nbnxn_grid_t *grid; + if (nbat->nenergrp == 1) + { + return; + } + for (g = 0; g < ngrid; g++) { grid = &nbs->grid[g]; @@ -1019,10 +1085,12 @@ void nbnxn_atomdata_set(nbnxn_atomdata_t *nbat, nbnxn_atomdata_set_charges(nbat, ngrid, nbs, mdatoms->chargeA); - if (nbat->nenergrp > 1) + if (nbs->bFEP) { - nbnxn_atomdata_set_energygroups(nbat, ngrid, nbs, atinfo); + nbnxn_atomdata_mask_fep(nbat, ngrid, nbs); } + + nbnxn_atomdata_set_energygroups(nbat, ngrid, nbs, atinfo); } /* Copies the shift vector array to nbnxn_atomdata_t */ diff --git a/src/gromacs/mdlib/nbnxn_consts.h b/src/gromacs/mdlib/nbnxn_consts.h index 4c790a42f4..f5bd3d01ab 100644 --- a/src/gromacs/mdlib/nbnxn_consts.h +++ b/src/gromacs/mdlib/nbnxn_consts.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014 by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -102,15 +102,15 @@ extern "C" { * Bit i*CJ_SIZE + j tells if atom i and j interact. 
*/ /* All interaction mask is the same for all kernels */ -#define NBNXN_INTERACTION_MASK_ALL 0xffffffff +static const unsigned int NBNXN_INTERACTION_MASK_ALL = 0xffffffffU; /* 4x4 kernel diagonal mask */ -#define NBNXN_INTERACTION_MASK_DIAG 0x08ce +static const unsigned int NBNXN_INTERACTION_MASK_DIAG = 0x08ceU; /* 4x2 kernel diagonal masks */ -#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002 -#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002F +static const unsigned int NBNXN_INTERACTION_MASK_DIAG_J2_0 = 0x0002U; +static const unsigned int NBNXN_INTERACTION_MASK_DIAG_J2_1 = 0x002fU; /* 4x8 kernel diagonal masks */ -#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfe -#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0 +static const unsigned int NBNXN_INTERACTION_MASK_DIAG_J8_0 = 0xf0f8fcfeU; +static const unsigned int NBNXN_INTERACTION_MASK_DIAG_J8_1 = 0x0080c0e0U; #ifdef __cplusplus diff --git a/src/gromacs/mdlib/nbnxn_internal.h b/src/gromacs/mdlib/nbnxn_internal.h index 8c366ee8cb..fc42e60935 100644 --- a/src/gromacs/mdlib/nbnxn_internal.h +++ b/src/gromacs/mdlib/nbnxn_internal.h @@ -95,46 +95,47 @@ typedef struct { /* A pair-search grid struct for one domain decomposition zone */ typedef struct { - rvec c0; /* The lower corner of the (local) grid */ - rvec c1; /* The upper corner of the (local) grid */ - real atom_density; /* The atom number density for the local grid */ - - gmx_bool bSimple; /* Is this grid simple or super/sub */ - int na_c; /* Number of atoms per cluster */ - int na_cj; /* Number of atoms for list j-clusters */ - int na_sc; /* Number of atoms per super-cluster */ - int na_c_2log; /* 2log of na_c */ - - int ncx; /* Number of (super-)cells along x */ - int ncy; /* Number of (super-)cells along y */ - int nc; /* Total number of (super-)cells */ - - real sx; /* x-size of a (super-)cell */ - real sy; /* y-size of a (super-)cell */ - real inv_sx; /* 1/sx */ - real inv_sy; /* 1/sy */ - - int cell0; /* Index in nbs->cell corresponding to cell 0 */ - - 
int *cxy_na; /* The number of atoms for each column in x,y */ - int *cxy_ind; /* Grid (super)cell index, offset from cell0 */ - int cxy_nalloc; /* Allocation size for cxy_na and cxy_ind */ - - int *nsubc; /* The number of sub cells for each super cell */ - float *bbcz; /* Bounding boxes in z for the super cells */ - nbnxn_bb_t *bb; /* 3D bounding boxes for the sub cells */ - nbnxn_bb_t *bbj; /* 3D j-bounding boxes for the case where * - * the i- and j-cluster sizes are different */ - float *pbb; /* 3D b. boxes in xxxx format per super cell */ - int *flags; /* Flag for the super cells */ - int nc_nalloc; /* Allocation size for the pointers above */ - - float *bbcz_simple; /* bbcz for simple grid converted from super */ - nbnxn_bb_t *bb_simple; /* bb for simple grid converted from super */ - int *flags_simple; /* flags for simple grid converted from super */ - int nc_nalloc_simple; /* Allocation size for the pointers above */ - - int nsubc_tot; /* Total number of subcell, used for printing */ + rvec c0; /* The lower corner of the (local) grid */ + rvec c1; /* The upper corner of the (local) grid */ + real atom_density; /* The atom number density for the local grid */ + + gmx_bool bSimple; /* Is this grid simple or super/sub */ + int na_c; /* Number of atoms per cluster */ + int na_cj; /* Number of atoms for list j-clusters */ + int na_sc; /* Number of atoms per super-cluster */ + int na_c_2log; /* 2log of na_c */ + + int ncx; /* Number of (super-)cells along x */ + int ncy; /* Number of (super-)cells along y */ + int nc; /* Total number of (super-)cells */ + + real sx; /* x-size of a (super-)cell */ + real sy; /* y-size of a (super-)cell */ + real inv_sx; /* 1/sx */ + real inv_sy; /* 1/sy */ + + int cell0; /* Index in nbs->cell corresponding to cell 0 */ + + int *cxy_na; /* The number of atoms for each column in x,y */ + int *cxy_ind; /* Grid (super)cell index, offset from cell0 */ + int cxy_nalloc; /* Allocation size for cxy_na and cxy_ind */ + + int *nsubc; /* The 
number of sub cells for each super cell */ + float *bbcz; /* Bounding boxes in z for the super cells */ + nbnxn_bb_t *bb; /* 3D bounding boxes for the sub cells */ + nbnxn_bb_t *bbj; /* 3D j-bounding boxes for the case where * + * the i- and j-cluster sizes are different */ + float *pbb; /* 3D b. boxes in xxxx format per super cell */ + int *flags; /* Flag for the super cells */ + unsigned int *fep; /* FEP signal bits for sub cells */ + int nc_nalloc; /* Allocation size for the pointers above */ + + float *bbcz_simple; /* bbcz for simple grid converted from super */ + nbnxn_bb_t *bb_simple; /* bb for simple grid converted from super */ + int *flags_simple; /* flags for simple grid converted from super */ + int nc_nalloc_simple; /* Allocation size for the pointers above */ + + int nsubc_tot; /* Total number of subcell, used for printing */ } nbnxn_grid_t; #ifdef GMX_NBNXN_SIMD @@ -230,6 +231,8 @@ typedef struct { int ndistc; /* Number of distance checks for flop counting */ + t_nblist *nbl_fep; /* Temporary FEP list for load balancing */ + nbnxn_cycle_t cc[enbsCCnr]; gmx_cache_protect_t cp1; @@ -237,6 +240,7 @@ typedef struct { /* Main pair-search struct, contains the grid(s), not the pair-list(s) */ typedef struct nbnxn_search { + gmx_bool bFEP; /* Do we have perturbed atoms? 
*/ int ePBC; /* PBC type enum */ matrix box; /* The periodic unit-cell */ diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c index 2f93f3036c..4f52448ee0 100644 --- a/src/gromacs/mdlib/nbnxn_search.c +++ b/src/gromacs/mdlib/nbnxn_search.c @@ -39,6 +39,8 @@ #include <math.h> #include <string.h> +#include <assert.h> + #include "sysstuff.h" #include "smalloc.h" #include "macros.h" @@ -55,6 +57,7 @@ #include "nbnxn_search.h" #include "gmx_omp_nthreads.h" #include "nrnb.h" +#include "ns.h" #include "gromacs/fileio/gmxfio.h" @@ -319,9 +322,34 @@ gmx_bool nbnxn_kernel_pairlist_simple(int nb_kernel_type) } } +/* Initializes a single nbnxn_pairlist_t data structure */ +static void nbnxn_init_pairlist_fep(t_nblist *nl) +{ + nl->type = GMX_NBLIST_INTERACTION_FREE_ENERGY; + nl->igeometry = GMX_NBLIST_GEOMETRY_PARTICLE_PARTICLE; + /* The interaction functions are set in the free energy kernel function */ + nl->ivdw = -1; + nl->ivdwmod = -1; + nl->ielec = -1; + nl->ielecmod = -1; + + nl->maxnri = 0; + nl->maxnrj = 0; + nl->nri = 0; + nl->nrj = 0; + nl->iinr = NULL; + nl->gid = NULL; + nl->shift = NULL; + nl->jindex = NULL; + nl->jjnr = NULL; + nl->excl_fep = NULL; + +} + void nbnxn_init_search(nbnxn_search_t * nbs_ptr, ivec *n_dd_cells, gmx_domdec_zones_t *zones, + gmx_bool bFEP, int nthread_max) { nbnxn_search_t nbs; @@ -330,6 +358,8 @@ void nbnxn_init_search(nbnxn_search_t * nbs_ptr, snew(nbs, 1); *nbs_ptr = nbs; + nbs->bFEP = bFEP; + nbs->DomDec = (n_dd_cells != NULL); clear_ivec(nbs->dd_dim); @@ -369,6 +399,9 @@ void nbnxn_init_search(nbnxn_search_t * nbs_ptr, nbs->work[t].cxy_na_nalloc = 0; nbs->work[t].sort_work = NULL; nbs->work[t].sort_work_nalloc = 0; + + snew(nbs->work[t].nbl_fep, 1); + nbnxn_init_pairlist_fep(nbs->work[t].nbl_fep); } /* Initialize detailed nbsearch cycle counting */ @@ -521,6 +554,10 @@ static int set_grid_size_xy(const nbnxn_search_t nbs, } srenew(grid->flags, grid->nc_nalloc); + if (nbs->bFEP) + { + srenew(grid->fep,
grid->nc_nalloc*grid->na_sc/grid->na_c); + } } copy_rvec(corner0, grid->c0); @@ -1052,7 +1089,7 @@ void sort_on_lj(int na_c, int subc, s, a, n1, n2, a_lj_max, i, j; int sort1[NBNXN_NA_SC_MAX/GPU_NSUBCELL]; int sort2[NBNXN_NA_SC_MAX/GPU_NSUBCELL]; - gmx_bool haveQ; + gmx_bool haveQ, bFEP; *flags = 0; @@ -1079,7 +1116,7 @@ void sort_on_lj(int na_c, } } - /* If we don't have atom with LJ, there's nothing to sort */ + /* If we don't have atoms with LJ, there's nothing to sort */ if (n1 > 0) { *flags |= NBNXN_CI_DO_LJ(subc); @@ -1140,6 +1177,23 @@ void fill_cell(const nbnxn_search_t nbs, grid->flags+(a0>>grid->na_c_2log)-grid->cell0); } + if (nbs->bFEP) + { + /* Set the fep flag for perturbed atoms in this (sub-)cell */ + int c, at; + + /* The grid-local cluster/(sub-)cell index */ + c = (a0 >> grid->na_c_2log) - grid->cell0*(grid->bSimple ? 1 : GPU_NSUBCELL); + grid->fep[c] = 0; + for (at = a0; at < a1; at++) + { + if (nbs->a[at] >= 0 && GET_CGINFO_FEP(atinfo[nbs->a[at]])) + { + grid->fep[c] |= (1 << (at - a0)); + } + } + } + /* Now we have sorted the atoms, set the cell indices */ for (a = a0; a < a1; a++) { @@ -2323,7 +2377,7 @@ static int nbl_cj(const nbnxn_pairlist_t *nbl, int cj_ind) } /* Returns the i-interaction mask of the j sub-cell for index cj_ind */ -static unsigned nbl_imask0(const nbnxn_pairlist_t *nbl, int cj_ind) +static unsigned int nbl_imask0(const nbnxn_pairlist_t *nbl, int cj_ind) { return nbl->cj4[cj_ind >> NBNXN_GPU_JGROUP_SIZE_2LOG].imei[0].imask; } @@ -2506,6 +2560,7 @@ void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, } snew(nbl_list->nbl, nbl_list->nnbl); + snew(nbl_list->nbl_fep, nbl_list->nnbl); /* Execute in order to avoid memory interleaving between threads */ #pragma omp parallel for num_threads(nbl_list->nnbl) schedule(static) for (i = 0; i < nbl_list->nnbl; i++) @@ -2524,6 +2579,9 @@ void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, { nbnxn_init_pairlist(nbl_list->nbl[i], nbl_list->bSimple, NULL, NULL); } + + 
snew(nbl_list->nbl_fep[i], 1); + nbnxn_init_pairlist_fep(nbl_list->nbl_fep[i]); } } @@ -2650,7 +2708,7 @@ static void low_get_nbl_exclusions(nbnxn_pairlist_t *nbl, int cj4, } /* Returns a pointer to the exclusion mask for cj4-unit cj4, warp warp, - * allocates extra memory, if necessary. + * generates a new element and allocates extra memory, if necessary. */ static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl, int cj4, int warp, nbnxn_excl_t **excl) @@ -2664,7 +2722,7 @@ static void get_nbl_exclusions_1(nbnxn_pairlist_t *nbl, int cj4, } /* Returns pointers to the exclusion mask for cj4-unit cj4 for both warps, - * allocates extra memory, if necessary. + * generates a new element and allocates extra memory, if necessary. */ static void get_nbl_exclusions_2(nbnxn_pairlist_t *nbl, int cj4, nbnxn_excl_t **excl_w0, @@ -2887,7 +2945,7 @@ static void make_cluster_list_supersub(const nbnxn_grid_t *gridi, int npair; int cjo, ci1, ci, cj, cj_gl; int cj4_ind, cj_offset; - unsigned imask; + unsigned int imask; nbnxn_cj4_t *cj4; #ifdef NBNXN_BBXXXX const float *pbb_ci; @@ -3193,6 +3251,398 @@ static void set_ci_top_excls(const nbnxn_search_t nbs, } } +/* Add a new i-entry to the FEP list and copy the i-properties */ +static gmx_inline void fep_list_new_nri_copy(t_nblist *nlist) +{ + /* Add a new i-entry */ + nlist->nri++; + + assert(nlist->nri < nlist->maxnri); + + /* Duplicate the last i-entry, except for jindex, which continues */ + nlist->iinr[nlist->nri] = nlist->iinr[nlist->nri-1]; + nlist->shift[nlist->nri] = nlist->shift[nlist->nri-1]; + nlist->gid[nlist->nri] = nlist->gid[nlist->nri-1]; + nlist->jindex[nlist->nri] = nlist->nrj; +} + +/* For load balancing of the free-energy lists over threads, we set + * the maximum nrj size of an i-entry to 40. This leads to good + * load balancing in the worst case scenario of a single perturbed + * particle on 16 threads, while not introducing significant overhead. 
+ * Note that half of the perturbed pairs will anyhow end up in very small lists, + * since non perturbed i-particles will see few perturbed j-particles). + */ +const int max_nrj_fep = 40; + +/* Exclude the perturbed pairs from the Verlet list. This is only done to avoid + * singularities for overlapping particles (0/0), since the charges and + * LJ parameters have been zeroed in the nbnxn data structure. + * Simultaneously make a group pair list for the perturbed pairs. + */ +static void make_fep_list(const nbnxn_search_t nbs, + const nbnxn_atomdata_t *nbat, + nbnxn_pairlist_t *nbl, + gmx_bool bDiagRemoved, + nbnxn_ci_t *nbl_ci, + const nbnxn_grid_t *gridi, + const nbnxn_grid_t *gridj, + t_nblist *nlist) +{ + int ci, cj_ind_start, cj_ind_end, cj_ind, cja, cjr; + int nri_max; + int ngid, gid_i = 0, gid_j, gid; + int egp_shift, egp_mask; + int gid_cj = 0; + int i, j, ind_i, ind_j, ai, aj; + int nri; + gmx_bool bFEP_i, bFEP_i_all; + + if (nbl_ci->cj_ind_end == nbl_ci->cj_ind_start) + { + /* Empty list */ + return; + } + + ci = nbl_ci->ci; + + cj_ind_start = nbl_ci->cj_ind_start; + cj_ind_end = nbl_ci->cj_ind_end; + + /* In worst case we have alternating energy groups and create npair lists */ + nri_max = nbl->na_ci*(cj_ind_end - cj_ind_start); + if (nlist->nri + nri_max > nlist->maxnri) + { + nlist->maxnri = over_alloc_large(nlist->nri + nri_max); + reallocate_nblist(nlist); + } + + ngid = nbat->nenergrp; + + if (ngid*gridj->na_cj > sizeof(gid_cj)*8) + { + gmx_fatal(FARGS, "The Verlet scheme with %dx%d kernels and free-energy only supports up to %d energy groups", + gridi->na_c, gridj->na_cj, (sizeof(gid_cj)*8)/gridj->na_cj); + } + + egp_shift = nbat->neg_2log; + egp_mask = (1<<nbat->neg_2log) - 1; + + /* Loop over the atoms in the i sub-cell */ + bFEP_i_all = TRUE; + for (i = 0; i < nbl->na_ci; i++) + { + ind_i = ci*nbl->na_ci + i; + ai = nbs->a[ind_i]; + if (ai >= 0) + { + nri = nlist->nri; + nlist->jindex[nri+1] = nlist->jindex[nri]; + nlist->iinr[nri] = ai; + /* The
actual energy group pair index is set later */ + nlist->gid[nri] = 0; + nlist->shift[nri] = nbl_ci->shift & NBNXN_CI_SHIFT; + + bFEP_i = gridi->fep[ci - gridi->cell0] & (1 << i); + + bFEP_i_all = bFEP_i_all && bFEP_i; + + if ((nlist->nrj + cj_ind_end - cj_ind_start)*nbl->na_cj > nlist->maxnrj) + { + nlist->maxnrj = over_alloc_small((nlist->nrj + cj_ind_end - cj_ind_start)*nbl->na_cj); + srenew(nlist->jjnr, nlist->maxnrj); + srenew(nlist->excl_fep, nlist->maxnrj); + } + + if (ngid > 1) + { + gid_i = (nbat->energrp[ci] >> (egp_shift*i)) & egp_mask; + } + + for (cj_ind = cj_ind_start; cj_ind < cj_ind_end; cj_ind++) + { + unsigned int fep_cj; + + cja = nbl->cj[cj_ind].cj; + + if (gridj->na_cj == gridj->na_c) + { + cjr = cja - gridj->cell0; + fep_cj = gridj->fep[cjr]; + if (ngid > 1) + { + gid_cj = nbat->energrp[cja]; + } + } + else if (2*gridj->na_cj == gridj->na_c) + { + cjr = cja - gridj->cell0*2; + /* Extract half of the ci fep/energrp mask */ + fep_cj = (gridj->fep[cjr>>1] >> ((cjr&1)*gridj->na_cj)) & ((1<<gridj->na_cj) - 1); + if (ngid > 1) + { + gid_cj = nbat->energrp[cja>>1] >> ((cja&1)*gridj->na_cj*egp_shift) & ((1<<(gridj->na_cj*egp_shift)) - 1); + } + } + else + { + cjr = cja - (gridj->cell0>>1); + /* Combine two ci fep masks/energrp */ + fep_cj = gridj->fep[cjr*2] + (gridj->fep[cjr*2+1] << gridj->na_c); + if (ngid > 1) + { + gid_cj = nbat->energrp[cja*2] + (nbat->energrp[cja*2+1] << (gridj->na_c*egp_shift)); + } + } + + if (bFEP_i || fep_cj != 0) + { + for (j = 0; j < nbl->na_cj; j++) + { + /* Is this interaction perturbed and not excluded?
*/ + ind_j = cja*nbl->na_cj + j; + aj = nbs->a[ind_j]; + if (aj >= 0 && + (bFEP_i || (fep_cj & (1 << j))) && + (!bDiagRemoved || ind_j >= ind_i)) + { + if (ngid > 1) + { + gid_j = (gid_cj >> (j*egp_shift)) & egp_mask; + gid = GID(gid_i, gid_j, ngid); + + if (nlist->nrj > nlist->jindex[nri] && + nlist->gid[nri] != gid) + { + /* Energy group pair changed: new list */ + fep_list_new_nri_copy(nlist); + nri = nlist->nri; + } + nlist->gid[nri] = gid; + } + + if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep) + { + fep_list_new_nri_copy(nlist); + nri = nlist->nri; + } + + /* Add it to the FEP list */ + nlist->jjnr[nlist->nrj] = aj; + nlist->excl_fep[nlist->nrj] = (nbl->cj[cj_ind].excl >> (i*nbl->na_cj + j)) & 1; + nlist->nrj++; + + /* Exclude it from the normal list. + * Note that the charge has been set to zero, + * but we need to avoid 0/0, as perturbed atoms + * can be on top of each other. + * (and the LJ parameters have not been zeroed) + */ + nbl->cj[cj_ind].excl &= ~(1U << (i*nbl->na_cj + j)); + } + } + } + } + + if (nlist->nrj > nlist->jindex[nri]) + { + nlist->nri++; + nlist->jindex[nlist->nri] = nlist->nrj; + } + } + } + + if (bFEP_i_all) + { + /* All interactions are perturbed, we can skip this entry */ + nbl_ci->cj_ind_end = cj_ind_start; + } +} + +/* Return the index of atom a within a cluster */ +static gmx_inline int cj_mod_cj4(int cj) +{ + return cj & (NBNXN_GPU_JGROUP_SIZE - 1); +} + +/* Convert a j-cluster to a cj4 group */ +static gmx_inline int cj_to_cj4(int cj) +{ + return cj >> NBNXN_GPU_JGROUP_SIZE_2LOG; +} + +/* Return the index of an j-atom within a warp */ +static gmx_inline int a_mod_wj(int a) +{ + return a & (NBNXN_GPU_CLUSTER_SIZE/2 - 1); +} + +/* As make_fep_list above, but for super/sub lists. 
*/ +static void make_fep_list_supersub(const nbnxn_search_t nbs, + const nbnxn_atomdata_t *nbat, + nbnxn_pairlist_t *nbl, + gmx_bool bDiagRemoved, + const nbnxn_sci_t *nbl_sci, + real shx, + real shy, + real shz, + real rlist_fep2, + const nbnxn_grid_t *gridi, + const nbnxn_grid_t *gridj, + t_nblist *nlist) +{ + int sci, cj4_ind_start, cj4_ind_end, cj4_ind, gcj, cjr; + int nri_max; + int c, c_abs; + int i, j, ind_i, ind_j, ai, aj; + int nri; + gmx_bool bFEP_i; + real xi, yi, zi; + const nbnxn_cj4_t *cj4; + + if (nbl_sci->cj4_ind_end == nbl_sci->cj4_ind_start) + { + /* Empty list */ + return; + } + + sci = nbl_sci->sci; + + cj4_ind_start = nbl_sci->cj4_ind_start; + cj4_ind_end = nbl_sci->cj4_ind_end; + + /* No energy groups (yet), so we split lists in max_nrj_fep pairs */ + nri_max = nbl->na_sc*(1 + ((cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE)/max_nrj_fep); + if (nlist->nri + nri_max > nlist->maxnri) + { + nlist->maxnri = over_alloc_large(nlist->nri + nri_max); + reallocate_nblist(nlist); + } + + /* Loop over the atoms in the i super-cluster */ + for (c = 0; c < GPU_NSUBCELL; c++) + { + c_abs = sci*GPU_NSUBCELL + c; + + for (i = 0; i < nbl->na_ci; i++) + { + ind_i = c_abs*nbl->na_ci + i; + ai = nbs->a[ind_i]; + if (ai >= 0) + { + nri = nlist->nri; + nlist->jindex[nri+1] = nlist->jindex[nri]; + nlist->iinr[nri] = ai; + /* With GPUs, energy groups are not supported */ + nlist->gid[nri] = 0; + nlist->shift[nri] = nbl_sci->shift & NBNXN_CI_SHIFT; + + bFEP_i = (gridi->fep[c_abs - gridi->cell0] & (1 << i)); + + xi = nbat->x[ind_i*nbat->xstride+XX] + shx; + yi = nbat->x[ind_i*nbat->xstride+YY] + shy; + zi = nbat->x[ind_i*nbat->xstride+ZZ] + shz; + + if ((nlist->nrj + cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE*nbl->na_cj > nlist->maxnrj) + { + nlist->maxnrj = over_alloc_small((nlist->nrj + cj4_ind_end - cj4_ind_start)*NBNXN_GPU_JGROUP_SIZE*nbl->na_cj); + srenew(nlist->jjnr, nlist->maxnrj); + srenew(nlist->excl_fep, nlist->maxnrj); + } + + for (cj4_ind = 
cj4_ind_start; cj4_ind < cj4_ind_end; cj4_ind++) + { + cj4 = &nbl->cj4[cj4_ind]; + + for (gcj = 0; gcj < NBNXN_GPU_JGROUP_SIZE; gcj++) + { + unsigned int fep_cj; + + if ((cj4->imei[0].imask & (1U << (gcj*GPU_NSUBCELL + c))) == 0) + { + /* Skip this ci for this cj */ + continue; + } + + cjr = cj4->cj[gcj] - gridj->cell0*GPU_NSUBCELL; + + fep_cj = gridj->fep[cjr]; + + if (bFEP_i || fep_cj != 0) + { + for (j = 0; j < nbl->na_cj; j++) + { + /* Is this interaction perturbed and not excluded? */ + ind_j = (gridj->cell0*GPU_NSUBCELL + cjr)*nbl->na_cj + j; + aj = nbs->a[ind_j]; + if (aj >= 0 && + (bFEP_i || (fep_cj & (1 << j))) && + (!bDiagRemoved || ind_j >= ind_i)) + { + nbnxn_excl_t *excl; + int excl_pair; + unsigned int excl_bit; + real dx, dy, dz; + + get_nbl_exclusions_1(nbl, cj4_ind, j>>2, &excl); + + excl_pair = a_mod_wj(j)*nbl->na_ci + i; + excl_bit = (1U << (gcj*GPU_NSUBCELL + c)); + + dx = nbat->x[ind_j*nbat->xstride+XX] - xi; + dy = nbat->x[ind_j*nbat->xstride+YY] - yi; + dz = nbat->x[ind_j*nbat->xstride+ZZ] - zi; + + /* The unpruned GPU list has more than 2/3 + * of the atom pairs beyond rlist. Using + * this list will cause a lot of overhead + * in the CPU FEP kernels, especially + * relative to the fast GPU kernels. + * So we prune the FEP list here. + */ + if (dx*dx + dy*dy + dz*dz < rlist_fep2) + { + if (nlist->nrj - nlist->jindex[nri] >= max_nrj_fep) + { + fep_list_new_nri_copy(nlist); + nri = nlist->nri; + } + + /* Add it to the FEP list */ + nlist->jjnr[nlist->nrj] = aj; + nlist->excl_fep[nlist->nrj] = (excl->pair[excl_pair] & excl_bit) ? 1 : 0; + nlist->nrj++; + } + + /* Exclude it from the normal list. + * Note that the charge and LJ parameters have + * been set to zero, but we need to avoid 0/0, + * as perturbed atoms can be on top of each other. + */ + excl->pair[excl_pair] &= ~excl_bit; + } + } + + /* Note that we could mask out this pair in imask + * if all i- and/or all j-particles are perturbed. 
+ * But since the perturbed pairs on the CPU will + * take an order of magnitude more time, the GPU + * will finish before the CPU and there is no gain. + */ + } + } + } + + if (nlist->nrj > nlist->jindex[nri]) + { + nlist->nri++; + nlist->jindex[nlist->nri] = nlist->nrj; + } + } + } + } +} + /* Set all atom-pair exclusions from the topology stored in excl * as masks in the pair-list for i-super-cell entry nbl_sci */ @@ -3316,26 +3766,15 @@ static void set_sci_top_excls(const nbnxn_search_t nbs, inner_i = i - si*na_c; inner_e = ge - se*na_c; -/* Macro for getting the index of atom a within a cluster */ -#define AMODCJ4(a) ((a) & (NBNXN_GPU_JGROUP_SIZE - 1)) -/* Macro for converting an atom number to a cluster number */ -#define A2CJ4(a) ((a) >> NBNXN_GPU_JGROUP_SIZE_2LOG) -/* Macro for getting the index of an i-atom within a warp */ -#define AMODWI(a) ((a) & (NBNXN_GPU_CLUSTER_SIZE/2 - 1)) - - if (nbl_imask0(nbl, found) & (1U << (AMODCJ4(found)*GPU_NSUBCELL + si))) + if (nbl_imask0(nbl, found) & (1U << (cj_mod_cj4(found)*GPU_NSUBCELL + si))) { w = (inner_e >> 2); - get_nbl_exclusions_1(nbl, A2CJ4(found), w, &nbl_excl); + get_nbl_exclusions_1(nbl, cj_to_cj4(found), w, &nbl_excl); - nbl_excl->pair[AMODWI(inner_e)*nbl->na_ci+inner_i] &= - ~(1U << (AMODCJ4(found)*GPU_NSUBCELL + si)); + nbl_excl->pair[a_mod_wj(inner_e)*nbl->na_ci+inner_i] &= + ~(1U << (cj_mod_cj4(found)*GPU_NSUBCELL + si)); } - -#undef AMODCJ4 -#undef A2CJ4 -#undef AMODWI } } } @@ -3615,6 +4054,18 @@ static void clear_pairlist(nbnxn_pairlist_t *nbl) nbl->work->ncj_hlj = 0; } +/* Clears a group scheme pair list */ +static void clear_pairlist_fep(t_nblist *nl) +{ + nl->nri = 0; + nl->nrj = 0; + if (nl->jindex == NULL) + { + snew(nl->jindex, 1); + } + nl->jindex[0] = 0; +} + /* Sets a simple list i-cell bounding box, including PBC shift */ static gmx_inline void set_icell_bb_simple(const nbnxn_bb_t *bb, int ci, real shx, real shy, real shz, @@ -3693,7 +4144,7 @@ static void icell_set_x_supersub(int ci, int 
stride, const real *x, nbnxn_list_work_t *work) { - int ia, i; + int ia, i; real *x_ci; x_ci = work->x_ci; @@ -3715,7 +4166,7 @@ static void icell_set_x_supersub_simd4(int ci, int stride, const real *x, nbnxn_list_work_t *work) { - int si, io, ia, i, j; + int si, io, ia, i, j; real *x_ci; x_ci = work->x_ci; @@ -3737,6 +4188,44 @@ static void icell_set_x_supersub_simd4(int ci, } #endif +static real minimum_subgrid_size_xy(const nbnxn_grid_t *grid) +{ + if (grid->bSimple) + { + return min(grid->sx, grid->sy); + } + else + { + return min(grid->sx/GPU_NSUBCELL_X, grid->sy/GPU_NSUBCELL_Y); + } +} + +static real effective_buffer_1x1_vs_MxN(const nbnxn_grid_t *gridi, + const nbnxn_grid_t *gridj) +{ + const real eff_1x1_buffer_fac_overest = 0.1; + + /* Determine an atom-pair list cut-off buffer size for atom pairs, + * to be added to rlist (including buffer) used for MxN. + * This is for converting an MxN list to a 1x1 list. This means we can't + * use the normal buffer estimate, as we have an MxN list in which + * some atom pairs beyond rlist are missing. We want to capture + * the beneficial effect of buffering by extra pairs just outside rlist, + * while removing the useless pairs that are further away from rlist. + * (Also the buffer could have been set manually not using the estimate.) + * This buffer size is an overestimate. + * We add 10% of the smallest grid sub-cell dimensions. + * Note that the z-size differs per cell and we don't use this, + * so we overestimate. + * With PME, the 10% value gives a buffer that is somewhat larger + * than the effective buffer with a tolerance of 0.005 kJ/mol/ps. + * Smaller tolerances or using RF lead to a smaller effective buffer, + * so 10% gives a safe overestimate. 
+ */ + return eff_1x1_buffer_fac_overest*(minimum_subgrid_size_xy(gridi) + + minimum_subgrid_size_xy(gridj)); +} + /* Clusters at the cut-off only increase rlist by 60% of their size */ static real nbnxn_rlist_inc_outside_fac = 0.6; @@ -3814,9 +4303,9 @@ static int get_nsubpair_max(const nbnxn_search_t nbs, int min_ci_balanced) { const nbnxn_grid_t *grid; - rvec ls; - real xy_diag2, r_eff_sup, vol_est, nsp_est, nsp_est_nl; - int nsubpair_max; + rvec ls; + real xy_diag2, r_eff_sup, vol_est, nsp_est, nsp_est_nl; + int nsubpair_max; grid = &nbs->grid[0]; @@ -3998,11 +4487,11 @@ static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl, #pragma omp parallel for num_threads(gmx_omp_nthreads_get(emntPairsearch)) schedule(static) for (n = 0; n < nnbl; n++) { - int sci_offset; - int cj4_offset; - int ci_offset; - int excl_offset; - int i, j4; + int sci_offset; + int cj4_offset; + int ci_offset; + int excl_offset; + int i, j4; const nbnxn_pairlist_t *nbli; /* Determine the offset in the combined data for our thread */ @@ -4050,6 +4539,121 @@ static void combine_nblists(int nnbl, nbnxn_pairlist_t **nbl, } } +static void balance_fep_lists(const nbnxn_search_t nbs, + nbnxn_pairlist_set_t *nbl_lists) +{ + int nnbl, th; + int nri_tot, nrj_tot, nrj_target; + int th_dest; + t_nblist *nbld; + + nnbl = nbl_lists->nnbl; + + if (nnbl == 1) + { + /* Nothing to balance */ + return; + } + + /* Count the total i-lists and pairs */ + nri_tot = 0; + nrj_tot = 0; + for (th = 0; th < nnbl; th++) + { + nri_tot += nbl_lists->nbl_fep[th]->nri; + nrj_tot += nbl_lists->nbl_fep[th]->nrj; + } + + nrj_target = (nrj_tot + nnbl - 1)/nnbl; + + assert(gmx_omp_nthreads_get(emntNonbonded) == nnbl); + +#pragma omp parallel for schedule(static) num_threads(nnbl) + for (th = 0; th < nnbl; th++) + { + t_nblist *nbl; + + nbl = nbs->work[th].nbl_fep; + + /* Note that here we allocate for the total size, instead of + * a per-thread esimate (which is hard to obtain). 
+ */ + if (nri_tot > nbl->maxnri) + { + nbl->maxnri = over_alloc_large(nri_tot); + reallocate_nblist(nbl); + } + if (nri_tot > nbl->maxnri || nrj_tot > nbl->maxnrj) + { + nbl->maxnrj = over_alloc_small(nrj_tot); + srenew(nbl->jjnr, nbl->maxnrj); + srenew(nbl->excl_fep, nbl->maxnrj); + } + + clear_pairlist_fep(nbl); + } + + /* Loop over the source lists and assign and copy i-entries */ + th_dest = 0; + nbld = nbs->work[th_dest].nbl_fep; + for (th = 0; th < nnbl; th++) + { + t_nblist *nbls; + int i, j; + + nbls = nbl_lists->nbl_fep[th]; + + for (i = 0; i < nbls->nri; i++) + { + int nrj; + + /* The number of pairs in this i-entry */ + nrj = nbls->jindex[i+1] - nbls->jindex[i]; + + /* Decide if list th_dest is too large and we should procede + * to the next destination list. + */ + if (th_dest+1 < nnbl && nbld->nrj > 0 && + nbld->nrj + nrj - nrj_target > nrj_target - nbld->nrj) + { + th_dest++; + nbld = nbs->work[th_dest].nbl_fep; + } + + nbld->iinr[nbld->nri] = nbls->iinr[i]; + nbld->gid[nbld->nri] = nbls->gid[i]; + nbld->shift[nbld->nri] = nbls->shift[i]; + + for (j = nbls->jindex[i]; j < nbls->jindex[i+1]; j++) + { + nbld->jjnr[nbld->nrj] = nbls->jjnr[j]; + nbld->excl_fep[nbld->nrj] = nbls->excl_fep[j]; + nbld->nrj++; + } + nbld->nri++; + nbld->jindex[nbld->nri] = nbld->nrj; + } + } + + /* Swap the list pointers */ + for (th = 0; th < nnbl; th++) + { + t_nblist *nbl_tmp; + + nbl_tmp = nbl_lists->nbl_fep[th]; + nbl_lists->nbl_fep[th] = nbs->work[th].nbl_fep; + nbs->work[th].nbl_fep = nbl_tmp; + + if (debug) + { + fprintf(debug, "nbl_fep[%d] nri %4d nrj %4d\n", + th, + nbl_lists->nbl_fep[th]->nri, + nbl_lists->nbl_fep[th]->nrj); + } + } +} + /* Returns the next ci to be processes by our thread */ static gmx_bool next_ci(const nbnxn_grid_t *grid, int conv, @@ -4132,7 +4736,7 @@ static int get_ci_block_size(const nbnxn_grid_t *gridi, const int ci_block_enum = 5; const int ci_block_denom = 11; const int ci_block_min_atoms = 16; - int ci_block; + int ci_block; /* Here we 
decide how to distribute the blocks over the threads. * We use prime numbers to try to avoid that the grid size becomes @@ -4179,37 +4783,38 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs, gmx_bool progBal, int min_ci_balanced, int th, int nth, - nbnxn_pairlist_t *nbl) + nbnxn_pairlist_t *nbl, + t_nblist *nbl_fep) { - int na_cj_2log; - matrix box; - real rl2; - float rbb2; - int d; - int ci_b, ci, ci_x, ci_y, ci_xy, cj; - ivec shp; - int tx, ty, tz; - int shift; - gmx_bool bMakeList; - real shx, shy, shz; - int conv_i, cell0_i; + int na_cj_2log; + matrix box; + real rl2, rl_fep2 = 0; + float rbb2; + int d; + int ci_b, ci, ci_x, ci_y, ci_xy, cj; + ivec shp; + int tx, ty, tz; + int shift; + gmx_bool bMakeList; + real shx, shy, shz; + int conv_i, cell0_i; const nbnxn_bb_t *bb_i = NULL; #ifdef NBNXN_BBXXXX - const float *pbb_i = NULL; + const float *pbb_i = NULL; #endif - const float *bbcz_i, *bbcz_j; - const int *flags_i; - real bx0, bx1, by0, by1, bz0, bz1; - real bz1_frac; - real d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy; - int cxf, cxl, cyf, cyf_x, cyl; - int cx, cy; - int c0, c1, cs, cf, cl; - int ndistc; - int ncpcheck; - int gridi_flag_shift = 0, gridj_flag_shift = 0; - unsigned *gridj_flag = NULL; - int ncj_old_i, ncj_old_j; + const float *bbcz_i, *bbcz_j; + const int *flags_i; + real bx0, bx1, by0, by1, bz0, bz1; + real bz1_frac; + real d2cx, d2z, d2z_cx, d2z_cy, d2zx, d2zxy, d2xy; + int cxf, cxl, cyf, cyf_x, cyl; + int cx, cy; + int c0, c1, cs, cf, cl; + int ndistc; + int ncpcheck; + int gridi_flag_shift = 0, gridj_flag_shift = 0; + unsigned int *gridj_flag = NULL; + int ncj_old_i, ncj_old_j; nbs_cycle_start(&work->cc[enbsCCsearch]); @@ -4247,6 +4852,23 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs, rl2 = nbl->rlist*nbl->rlist; + if (nbs->bFEP && !nbl->bSimple) + { + /* Determine an atom-pair list cut-off distance for FEP atom pairs. 
+ * We should not simply use rlist, since then we would not have + * the small, effective buffering of the NxN lists. + * The buffer is on overestimate, but the resulting cost for pairs + * beyond rlist is neglible compared to the FEP pairs within rlist. + */ + rl_fep2 = nbl->rlist + effective_buffer_1x1_vs_MxN(gridi, gridj); + + if (debug) + { + fprintf(debug, "nbl_fep atom-pair rlist %f\n", rl_fep2); + } + rl_fep2 = rl_fep2*rl_fep2; + } + rbb2 = boundingbox_only_distance2(gridi, gridj, nbl->rlist, nbl->bSimple); if (debug) @@ -4715,6 +5337,14 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs, na_cj_2log, &(nbl->ci[nbl->nci]), excl); + + if (nbs->bFEP) + { + make_fep_list(nbs, nbat, nbl, + shift == CENTRAL && gridi == gridj, + &(nbl->ci[nbl->nci]), + gridi, gridj, nbl_fep); + } } else { @@ -4724,6 +5354,16 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs, gridj->na_c_2log, &(nbl->sci[nbl->nsci]), excl); + + if (nbs->bFEP) + { + make_fep_list_supersub(nbs, nbat, nbl, + shift == CENTRAL && gridi == gridj, + &(nbl->sci[nbl->nsci]), + shx, shy, shz, + rl_fep2, + gridi, gridj, nbl_fep); + } } /* Close this ci list */ @@ -4767,6 +5407,10 @@ static void nbnxn_make_pairlist_part(const nbnxn_search_t nbs, print_nblist_statistics_supersub(debug, nbl, nbs, rlist); } + if (nbs->bFEP) + { + fprintf(debug, "nbl FEP list pairs: %d\n", nbl_fep->nrj); + } } } @@ -4774,8 +5418,8 @@ static void reduce_buffer_flags(const nbnxn_search_t nbs, int nsrc, const nbnxn_buffer_flags_t *dest) { - int s, b; - const unsigned *flag; + int s, b; + const unsigned int *flag; for (s = 0; s < nsrc; s++) { @@ -4920,17 +5564,17 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, int nb_kernel_type, t_nrnb *nrnb) { - nbnxn_grid_t *gridi, *gridj; - gmx_bool bGPUCPU; - int nzi, zi, zj0, zj1, zj; - int nsubpair_max; - int th; - int nnbl; + nbnxn_grid_t *gridi, *gridj; + gmx_bool bGPUCPU; + int nzi, zi, zj0, zj1, zj; + int nsubpair_max; + int th; + int nnbl; nbnxn_pairlist_t 
**nbl; - int ci_block; - gmx_bool CombineNBLists; - gmx_bool progBal; - int np_tot, np_noq, np_hlj, nap; + int ci_block; + gmx_bool CombineNBLists; + gmx_bool progBal; + int np_tot, np_noq, np_hlj, nap; /* Check if we are running hybrid GPU + CPU nbnxn mode */ bGPUCPU = (!nbs->grid[0].bSimple && nbl_list->bSimple); @@ -5004,6 +5648,11 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, for (th = 0; th < nnbl; th++) { clear_pairlist(nbl[th]); + + if (nbs->bFEP) + { + clear_pairlist_fep(nbl_list->nbl_fep[th]); + } } for (zi = 0; zi < nzi; zi++) @@ -5072,7 +5721,8 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, nsubpair_max, progBal, min_ci_balanced, th, nnbl, - nbl[th]); + nbl[th], + nbl_list->nbl_fep[th]); } nbs_cycle_stop(&nbs->cc[enbsCCsearch]); @@ -5133,6 +5783,12 @@ void nbnxn_make_pairlist(const nbnxn_search_t nbs, reduce_buffer_flags(nbs, nnbl, &nbat->buffer_flags); } + if (nbs->bFEP) + { + /* Balance the free-energy lists over all the threads */ + balance_fep_lists(nbs, nbl_list); + } + /* Special performance logging stuff (env.var. GMX_NBNXN_CYCLE) */ if (LOCAL_I(iloc)) { diff --git a/src/gromacs/mdlib/nbnxn_search.h b/src/gromacs/mdlib/nbnxn_search.h index ebb142cf27..12576a096c 100644 --- a/src/gromacs/mdlib/nbnxn_search.h +++ b/src/gromacs/mdlib/nbnxn_search.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -59,6 +59,7 @@ real nbnxn_get_rlist_effective_inc(int cluster_size, real atom_density); void nbnxn_init_search(nbnxn_search_t * nbs_ptr, ivec *n_dd_cells, gmx_domdec_zones_t *zones, + gmx_bool bFEP, int nthread_max); /* Put the atoms on the pair search grid. 
@@ -117,6 +118,7 @@ void nbnxn_init_pairlist_set(nbnxn_pairlist_set_t *nbl_list, * number or roughly equally sized ci blocks in nbl. * When set >0 ci lists will be chopped up when the estimate * for the number of equally sized lists is below min_ci_balanced. + * With perturbed particles, also a group scheme style nbl_fep list is made. */ void nbnxn_make_pairlist(const nbnxn_search_t nbs, nbnxn_atomdata_t *nbat, diff --git a/src/gromacs/mdlib/ns.c b/src/gromacs/mdlib/ns.c index 6cfb7d2936..ff66fe8471 100644 --- a/src/gromacs/mdlib/ns.c +++ b/src/gromacs/mdlib/ns.c @@ -104,7 +104,7 @@ round_up_to_simd_width(int length, int simd_width) * ************************************************/ -static void reallocate_nblist(t_nblist *nl) +void reallocate_nblist(t_nblist *nl) { if (gmx_debug_at) { @@ -169,13 +169,14 @@ static void init_nblist(FILE *log, t_nblist *nl_sr, t_nblist *nl_lr, */ nl->maxnri = homenr*4; nl->maxnrj = 0; - nl->maxlen = 0; nl->nri = -1; nl->nrj = 0; nl->iinr = NULL; nl->gid = NULL; nl->shift = NULL; nl->jindex = NULL; + nl->jjnr = NULL; + nl->excl_fep = NULL; reallocate_nblist(nl); nl->jindex[0] = 0; @@ -332,7 +333,6 @@ static void reset_nblist(t_nblist *nl) { nl->nri = -1; nl->nrj = 0; - nl->maxlen = 0; if (nl->jindex) { nl->jindex[0] = 0; @@ -434,14 +434,6 @@ static inline void close_i_nblist(t_nblist *nlist) nlist->jindex[nri+1] = nlist->nrj; len = nlist->nrj - nlist->jindex[nri]; - - /* nlist length for water i molecules is treated statically - * in the innerloops - */ - if (len > nlist->maxlen) - { - nlist->maxlen = len; - } } } diff --git a/src/gromacs/mdlib/sim_util.c b/src/gromacs/mdlib/sim_util.c index e154f2eba2..52ab9568fd 100644 --- a/src/gromacs/mdlib/sim_util.c +++ b/src/gromacs/mdlib/sim_util.c @@ -43,6 +43,8 @@ #include #endif #include +#include + #include "typedefs.h" #include "string2.h" #include "smalloc.h" @@ -77,6 +79,9 @@ #include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h" #include 
"nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h" #include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h" +#include "nonbonded.h" +#include "../gmxlib/nonbonded/nb_kernel.h" +#include "../gmxlib/nonbonded/nb_free_energy.h" #include "gromacs/timing/wallcycle.h" #include "gromacs/timing/walltime_accounting.h" @@ -88,6 +93,8 @@ #include "adress.h" #include "qmmm.h" +#include "gmx_omp_nthreads.h" + #include "nbnxn_cuda_data_mgmt.h" #include "nbnxn_cuda/nbnxn_cuda.h" @@ -686,6 +693,114 @@ static void do_nb_verlet(t_forcerec *fr, } } +static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists, + t_forcerec *fr, + rvec x[], + rvec f[], + t_mdatoms *mdatoms, + t_lambda *fepvals, + real *lambda, + gmx_enerdata_t *enerd, + int flags, + t_nrnb *nrnb, + gmx_wallcycle_t wcycle) +{ + int donb_flags; + nb_kernel_data_t kernel_data; + real lam_i[efptNR]; + real dvdl_nb[efptNR]; + int th; + int i, j; + + donb_flags = 0; + /* Add short-range interactions */ + donb_flags |= GMX_NONBONDED_DO_SR; + + /* Currently all group scheme kernels always calculate (shift-)forces */ + if (flags & GMX_FORCE_FORCES) + { + donb_flags |= GMX_NONBONDED_DO_FORCE; + } + if (flags & GMX_FORCE_VIRIAL) + { + donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE; + } + if (flags & GMX_FORCE_ENERGY) + { + donb_flags |= GMX_NONBONDED_DO_POTENTIAL; + } + if (flags & GMX_FORCE_DO_LR) + { + donb_flags |= GMX_NONBONDED_DO_LR; + } + + kernel_data.flags = donb_flags; + kernel_data.lambda = lambda; + kernel_data.dvdl = dvdl_nb; + + kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR]; + kernel_data.energygrp_vdw = enerd->grpp.ener[egLJSR]; + + /* reset free energy components */ + for (i = 0; i < efptNR; i++) + { + dvdl_nb[i] = 0; + } + + assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl); + + wallcycle_sub_start(wcycle, ewcsNONBONDED); +#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl) + for (th = 0; th < nbl_lists->nnbl; th++) + { + gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th], + x, f, fr, 
mdatoms, &kernel_data, nrnb); + } + + if (fepvals->sc_alpha != 0) + { + enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW]; + enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL]; + } + else + { + enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW]; + enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL]; + } + + /* If we do foreign lambda and we have soft-core interactions + * we have to recalculate the (non-linear) energies contributions. + */ + if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0) + { + kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA; + kernel_data.lambda = lam_i; + kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR]; + kernel_data.energygrp_vdw = enerd->foreign_grpp.ener[egLJSR]; + /* Note that we add to kernel_data.dvdl, but ignore the result */ + + for (i = 0; i < enerd->n_lambda; i++) + { + for (j = 0; j < efptNR; j++) + { + lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]); + } + reset_foreign_enerdata(enerd); +#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl) + for (th = 0; th < nbl_lists->nnbl; th++) + { + gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th], + x, f, fr, mdatoms, &kernel_data, nrnb); + } + + sum_epot(&(enerd->foreign_grpp), enerd->foreign_term); + enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT]; + } + } + + wallcycle_sub_stop(wcycle, ewcsNONBONDED); +} + void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, t_inputrec *inputrec, gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle, @@ -1152,6 +1267,29 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr, nrnb, wcycle); } + if (fr->efep != efepNO) + { + /* Calculate the local and non-local free energy interactions here. + * Happens here on the CPU both with and without GPU. 
+ */ + if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0) + { + do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists, + fr, x, f, mdatoms, + inputrec->fepvals, lambda, + enerd, flags, nrnb, wcycle); + } + + if (DOMAINDECOMP(cr) && + fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0) + { + do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists, + fr, x, f, mdatoms, + inputrec->fepvals, lambda, + enerd, flags, nrnb, wcycle); + } + } + if (!bUseOrEmulGPU || bDiffKernels) { int aloc; diff --git a/src/programs/mdrun/md.c b/src/programs/mdrun/md.c index 3ae70f5038..f5c151202c 100644 --- a/src/programs/mdrun/md.c +++ b/src/programs/mdrun/md.c @@ -458,13 +458,10 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[], repl_ex_nst, repl_ex_nex, repl_ex_seed); } - /* PME tuning is only supported with GPUs or PME nodes and not with rerun or LJ-PME. - * With perturbed charges with soft-core we should not change the cut-off. - */ + /* PME tuning is only supported with GPUs or PME nodes and not with rerun or LJ-PME. */ if ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) && - !(ir->efep != efepNO && mdatoms->nChargePerturbed > 0 && ir->fepvals->bScCoul) && !bRerunMD && !EVDW_PME(fr->vdwtype)) { pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata); diff --git a/src/programs/mdrun/pme_loadbal.c b/src/programs/mdrun/pme_loadbal.c index 4606f735cc..5ea760ec3a 100644 --- a/src/programs/mdrun/pme_loadbal.c +++ b/src/programs/mdrun/pme_loadbal.c @@ -692,17 +692,12 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb, } #endif /* GMX_THREAD_MPI */ } - else - { - init_interaction_const_tables(NULL, ic, bUsesSimpleTables, - rtab); - } - if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->ngrp > 1) - { - init_interaction_const_tables(NULL, ic, bUsesSimpleTables, - rtab); - } + /* Usually we won't need the simple tables with GPUs. 
+ * But we do with hybrid acceleration and with free energy. + * To avoid bugs, we always re-initialize the simple tables here. + */ + init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab); if (cr->duty & DUTY_PME) { diff --git a/src/programs/mdrun/runner.c b/src/programs/mdrun/runner.c index 767d0bb875..96d4f2f06c 100644 --- a/src/programs/mdrun/runner.c +++ b/src/programs/mdrun/runner.c @@ -1598,11 +1598,6 @@ int mdrunner(gmx_hw_opt_t *hw_opt, */ mdatoms = init_mdatoms(fplog, mtop, inputrec->efep != efepNO); - if (mdatoms->nPerturbed > 0 && inputrec->cutoff_scheme == ecutsVERLET) - { - gmx_fatal(FARGS, "The Verlet cut-off scheme does not (yet) support free-energy calculations with perturbed atoms, only perturbed interactions. This will be implemented soon. Use the group scheme for now."); - } - /* Initialize the virtual site communication */ vsite = init_vsite(mtop, cr, FALSE);