From a6ae71b20228b41783c7b55fbb86831466d68b1c Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Tue, 3 Dec 2013 09:54:35 +0100 Subject: [PATCH] Added LJ-PME and LJ Pot-switch to PME load estimate Change-Id: I7f032fffb0a26d40914740c1180c4137c0b95653 --- src/gromacs/mdlib/perf_est.c | 134 +++++++++++++++++++++++++---------- 1 file changed, 95 insertions(+), 39 deletions(-) diff --git a/src/gromacs/mdlib/perf_est.c b/src/gromacs/mdlib/perf_est.c index 33c6d010e6..def9c5b310 100644 --- a/src/gromacs/mdlib/perf_est.c +++ b/src/gromacs/mdlib/perf_est.c @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2008, The GROMACS development team. - * Copyright (c) 2012, by the GROMACS development team, led by + * Copyright (c) 2012,2014, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -56,24 +56,26 @@ * although not so much that the numbers need to be adjusted. */ -/* Cost of a pair interaction in the "group" cut-off scheme" */ -#define C_GR_FQ 1.5 -#define C_GR_QLJ_CUT 1.5 -#define C_GR_QLJ_TAB 2.0 -#define C_GR_LJ_CUT 1.0 -#define C_GR_LJ_TAB 1.75 +/* Cost of a pair interaction in the "group" cut-off scheme */ +#define C_GR_FQ 1.5 +#define C_GR_QLJ_CUT 1.5 +#define C_GR_QLJ_TAB 2.0 +#define C_GR_LJ_CUT 1.0 +#define C_GR_LJ_TAB 1.75 /* Cost of 1 water with one Q/LJ atom */ -#define C_GR_QLJW_CUT 2.0 -#define C_GR_QLJW_TAB 2.25 +#define C_GR_QLJW_CUT 2.0 +#define C_GR_QLJW_TAB 2.25 /* Cost of 1 water with one Q atom or with 1/3 water (LJ negligible) */ -#define C_GR_QW 1.75 +#define C_GR_QW 1.75 -/* Cost of a pair interaction in the "Verlet" cut-off scheme" */ -#define C_VT_LJ 0.30 -#define C_VT_QLJ_RF 0.40 -#define C_VT_Q_RF 0.30 -#define C_VT_QLJ_TAB 0.55 -#define C_VT_Q_TAB 0.50 +/* Cost of a pair interaction in the "Verlet" cut-off scheme, QEXP is Ewald */ +#define C_VT_LJ 0.30 +#define C_VT_QRF_LJ 0.40 +#define C_VT_QRF 0.30 +#define C_VT_QEXP_LJ 0.55 +#define C_VT_QEXP 0.50 +/* Extra cost for expensive LJ interaction, e.g. pot-switch or LJ-PME */ +#define C_VT_LJEXP_ADD 0.20 /* Cost of PME, with all components running with SSE instructions */ /* Cost of particle reordering and redistribution */ @@ -138,9 +140,9 @@ int n_bonded_dx(gmx_mtop_t *mtop, gmx_bool bExcl) } static void pp_group_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box, - int *nq_tot, + int *nq_tot, int *nlj_tot, double *cost_pp, - gmx_bool *bChargePerturbed) + gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed) { t_atom *atom; int mb, nmol, atnr, cg, a, a0, ncqlj, ncq, nclj; @@ -197,6 +199,10 @@ static void pp_group_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box, { *bChargePerturbed = TRUE; } + if (atom[a].type != atom[a].typeB) + { + *bTypePerturbed = TRUE; + } /* This if this atom fits into water optimization */ if (!((a == a0 && bQ && bLJ) || (a == a0+1 && bQ && !bLJ) || @@ -235,7 +241,8 @@ static void pp_group_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box, } } - *nq_tot = nq + nqlj + nw*3; + *nq_tot = nq + nqlj + nw*3; + *nlj_tot = nlj + nqlj + nw; if (debug) { @@ -256,17 +263,26 @@ static void pp_group_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box, } static void pp_verlet_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box, - int *nq_tot, + int *nq_tot, int *nlj_tot, double *cost_pp, - gmx_bool *bChargePerturbed) + gmx_bool *bChargePerturbed, gmx_bool *bTypePerturbed) { t_atom *atom; int mb, nmol, atnr, cg, a, a0, nqlj, nq, nlj; gmx_bool bQRF; t_iparams *iparams; gmx_moltype_t *molt; - float r_eff; + real r_eff; + double c_qlj, c_q, c_lj; double nat; + /* Conversion factor for reference vs SIMD kernel performance. + * The factor is about right for SSE2/4, but should be 2 higher for AVX256. + */ +#ifdef GMX_DOUBLE + const real nbnxn_refkernel_fac = 4.0; +#else + const real nbnxn_refkernel_fac = 8.0; +#endif bQRF = (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT); @@ -299,12 +315,17 @@ static void pp_verlet_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box, { *bChargePerturbed = TRUE; } + if (atom[a].type != atom[a].typeB) + { + *bTypePerturbed = TRUE; + } } } nlj = mtop->natoms - nqlj - nq; - *nq_tot = nqlj + nq; + *nq_tot = nqlj + nq; + *nlj_tot = nqlj + nlj; /* Effective cut-off for cluster pair list of 4x4 atoms */ r_eff = ir->rlist + nbnxn_get_rlist_effective_inc(NBNXN_CPU_CLUSTER_I_SIZE, mtop->natoms/det(box)); @@ -315,22 +336,38 @@ static void pp_verlet_load(gmx_mtop_t *mtop, t_inputrec *ir, matrix box, nqlj, nq, nlj, ir->rlist, r_eff); } + /* Determine the cost per pair interaction */ + c_qlj = (bQRF ? C_VT_QRF_LJ : C_VT_QEXP_LJ); + c_q = (bQRF ? C_VT_QRF : C_VT_QEXP); + c_lj = C_VT_LJ; + if (ir->vdw_modifier == eintmodPOTSWITCH || EVDW_PME(ir->vdwtype)) + { + c_qlj += C_VT_LJEXP_ADD; + c_lj += C_VT_LJEXP_ADD; + } + if (EVDW_PME(ir->vdwtype) && ir->ljpme_combination_rule == eljpmeLB) + { + /* We don't have LJ-PME LB comb. rule kernels, we use slow kernels */ + c_qlj *= nbnxn_refkernel_fac; + c_q *= nbnxn_refkernel_fac; + c_lj *= nbnxn_refkernel_fac; + } + /* For the PP non-bonded cost it is (unrealistically) assumed * that all atoms are distributed homogeneously in space. */ /* Convert mtop->natoms to double to avoid int overflow */ nat = mtop->natoms; - *cost_pp = 0.5*(nqlj*nat*(bQRF ? C_VT_QLJ_RF : C_VT_QLJ_TAB) + - nq*nat*(bQRF ? C_VT_Q_RF : C_VT_Q_TAB) + - nlj*nat*C_VT_LJ) + *cost_pp = 0.5*nat*(nqlj*c_qlj + nq*c_q + nlj*c_lj) *4/3*M_PI*r_eff*r_eff*r_eff/det(box); } float pme_load_estimate(gmx_mtop_t *mtop, t_inputrec *ir, matrix box) { t_atom *atom; - int mb, nmol, atnr, cg, a, a0, nq_tot; - gmx_bool bBHAM, bLJcut, bChargePerturbed, bWater, bQ, bLJ; + int mb, nmol, atnr, cg, a, a0, nq_tot, nlj_tot, f; + gmx_bool bBHAM, bLJcut, bChargePerturbed, bTypePerturbed; + gmx_bool bWater, bQ, bLJ; double cost_bond, cost_pp, cost_redist, cost_spread, cost_fft, cost_solve, cost_pme; float ratio; t_iparams *iparams; @@ -350,24 +387,43 @@ float pme_load_estimate(gmx_mtop_t *mtop, t_inputrec *ir, matrix box) if (ir->cutoff_scheme == ecutsGROUP) { - pp_group_load(mtop, ir, box, &nq_tot, &cost_pp, &bChargePerturbed); + pp_group_load(mtop, ir, box, + &nq_tot, &nlj_tot, &cost_pp, + &bChargePerturbed, &bTypePerturbed); } else { - pp_verlet_load(mtop, ir, box, &nq_tot, &cost_pp, &bChargePerturbed); + pp_verlet_load(mtop, ir, box, + &nq_tot, &nlj_tot, &cost_pp, + &bChargePerturbed, &bTypePerturbed); } - cost_redist = C_PME_REDIST*nq_tot; - cost_spread = C_PME_SPREAD*nq_tot*pow(ir->pme_order, 3); - cost_fft = C_PME_FFT*ir->nkx*ir->nky*ir->nkz*log(ir->nkx*ir->nky*ir->nkz); - cost_solve = C_PME_SOLVE*ir->nkx*ir->nky*ir->nkz; + cost_redist = 0; + cost_spread = 0; + cost_fft = 0; + cost_solve = 0; + + if (EEL_PME(ir->coulombtype)) + { + f = ((ir->efep != efepNO && bChargePerturbed) ? 2 : 1); + cost_redist += C_PME_REDIST*nq_tot; + cost_spread += f*C_PME_SPREAD*nq_tot*pow(ir->pme_order, 3); + cost_fft += f*C_PME_FFT*ir->nkx*ir->nky*ir->nkz*log(ir->nkx*ir->nky*ir->nkz); + cost_solve += f*C_PME_SOLVE*ir->nkx*ir->nky*ir->nkz; + } - if (ir->efep != efepNO && bChargePerturbed) + if (EVDW_PME(ir->vdwtype)) { - /* All PME work, except redist & spline coefficient calculation, doubles */ - cost_spread *= 2; - cost_fft *= 2; - cost_solve *= 2; + f = ((ir->efep != efepNO && bTypePerturbed) ? 2 : 1); + if (ir->ljpme_combination_rule == eljpmeLB) + { + /* LB combination rule: we have 7 mesh terms */ + f *= 7; + } + cost_redist += C_PME_REDIST*nlj_tot; + cost_spread += f*C_PME_SPREAD*nlj_tot*pow(ir->pme_order, 3); + cost_fft += f*C_PME_FFT*ir->nkx*ir->nky*ir->nkz*log(ir->nkx*ir->nky*ir->nkz); + cost_solve += f*C_PME_SOLVE*ir->nkx*ir->nky*ir->nkz; } cost_pme = cost_redist + cost_spread + cost_fft + cost_solve; -- 2.22.0