From 132373bd62ba7b1190952717e097d652ba1f68c0 Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Mon, 15 Jun 2015 23:56:26 +0200 Subject: [PATCH] GPU Ewald table kernel now uses CPU spacing The tabulated Ewald GPU kernels (only used by default on Fermi), used a fixed table spacing. Now the same adaptive spacing as on the CPU is used. With default settings this leads to a coarser table. Change-Id: I35f0c89e00301ac7e1260b1c0dcc1604300da5aa --- src/gromacs/ewald/pme-load-balancing.cpp | 11 +- src/gromacs/legacyheaders/force.h | 6 +- .../legacyheaders/types/interaction_const.h | 4 +- src/gromacs/mdlib/forcerec.cpp | 33 ++-- src/gromacs/mdlib/nbnxn_consts.h | 4 - .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 142 +++++++++--------- 6 files changed, 93 insertions(+), 107 deletions(-) diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp index 2d5ff5e645..fab16ec2b7 100644 --- a/src/gromacs/ewald/pme-load-balancing.cpp +++ b/src/gromacs/ewald/pme-load-balancing.cpp @@ -528,7 +528,6 @@ pme_load_balance(pme_load_balancing_t *pme_lb, double cycles_fast; char buf[STRLEN], sbuf[22]; real rtab; - gmx_bool bUsesSimpleTables = TRUE; if (pme_lb->stage == pme_lb->nstage) { @@ -772,7 +771,9 @@ pme_load_balance(pme_load_balancing_t *pme_lb, } } - bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0); + /* We always re-initialize the tables whether they are used or not */ + init_interaction_const_tables(NULL, ic, rtab); + nbnxn_gpu_pme_loadbal_update_param(nbv, ic); /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore @@ -794,12 +795,6 @@ pme_load_balance(pme_load_balancing_t *pme_lb, } #endif /* GMX_THREAD_MPI */ - /* Usually we won't need the simple tables with GPUs. - * But we do with hybrid acceleration and with free energy. - * To avoid bugs, we always re-initialize the simple tables here. - */ - init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab); - if (!pme_lb->bSepPMERanks) { if (pme_lb->setup[pme_lb->cur].pmedata == NULL) diff --git a/src/gromacs/legacyheaders/force.h b/src/gromacs/legacyheaders/force.h index 4014b5740e..fa5c22cff4 100644 --- a/src/gromacs/legacyheaders/force.h +++ b/src/gromacs/legacyheaders/force.h @@ -155,12 +155,8 @@ gmx_bool uses_simple_tables(int cutoff_scheme, void init_interaction_const_tables(FILE *fp, interaction_const_t *ic, - gmx_bool bSimpleTable, real rtab); -/* Initializes the tables in the interaction constant data structure. - * Setting verlet_kernel_type to -1 always initializes tables for - * use with group kernels. - */ +/* Initializes the tables in the interaction constant data structure. */ void init_forcerec(FILE *fplog, const output_env_t oenv, diff --git a/src/gromacs/legacyheaders/types/interaction_const.h b/src/gromacs/legacyheaders/types/interaction_const.h index 319d6c2a61..e1016d26d1 100644 --- a/src/gromacs/legacyheaders/types/interaction_const.h +++ b/src/gromacs/legacyheaders/types/interaction_const.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by + * Copyright (c) 2012,2013,2014,2015, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -68,6 +68,8 @@ typedef struct { } switch_consts_t; typedef struct { + int cutoff_scheme; + /* VdW */ int vdwtype; int vdw_modifier; diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp index e5becd4205..7cde5b73b2 100644 --- a/src/gromacs/mdlib/forcerec.cpp +++ b/src/gromacs/mdlib/forcerec.cpp @@ -43,6 +43,8 @@ #include #include +#include + #include "gromacs/domdec/domdec.h" #include "gromacs/ewald/ewald.h" #include "gromacs/gmxlib/gpu_utils/gpu_utils.h" @@ -1833,27 +1835,24 @@ gmx_bool uses_simple_tables(int cutoff_scheme, } static void init_ewald_f_table(interaction_const_t *ic, - gmx_bool bUsesSimpleTables, real rtab) { real maxr; - if (bUsesSimpleTables) - { - /* Get the Ewald table spacing based on Coulomb and/or LJ - * Ewald coefficients and rtol. - */ - ic->tabq_scale = ewald_spline3_table_scale(ic); + /* Get the Ewald table spacing based on Coulomb and/or LJ + * Ewald coefficients and rtol. + */ + ic->tabq_scale = ewald_spline3_table_scale(ic); - maxr = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb; - ic->tabq_size = (int)(maxr*ic->tabq_scale) + 2; + if (ic->cutoff_scheme == ecutsVERLET) + { + maxr = ic->rcoulomb; } else { - ic->tabq_size = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE; - /* Subtract 2 iso 1 to avoid access out of range due to rounding */ - ic->tabq_scale = (ic->tabq_size - 2)/ic->rcoulomb; + maxr = std::max(ic->rcoulomb, rtab); } + ic->tabq_size = static_cast(maxr*ic->tabq_scale) + 2; sfree_aligned(ic->tabq_coul_FDV0); sfree_aligned(ic->tabq_coul_F); @@ -1885,12 +1884,11 @@ static void init_ewald_f_table(interaction_const_t *ic, void init_interaction_const_tables(FILE *fp, interaction_const_t *ic, - gmx_bool bUsesSimpleTables, real rtab) { if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype) || EVDW_PME(ic->vdwtype)) { - init_ewald_f_table(ic, bUsesSimpleTables, rtab); + init_ewald_f_table(ic, rtab); if (fp != NULL) { @@ -1957,6 +1955,8 @@ init_interaction_const(FILE *fp, snew(ic, 1); + ic->cutoff_scheme = fr->cutoff_scheme; + /* Just allocate something so we can free it */ snew_aligned(ic->tabq_coul_FDV0, 16, 32); snew_aligned(ic->tabq_coul_F, 16, 32); @@ -3227,10 +3227,9 @@ void init_forcerec(FILE *fp, init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt); } + init_interaction_const_tables(fp, fr->ic, rtab); + initialize_gpu_constants(cr, fr->ic, fr->nbv); - init_interaction_const_tables(fp, fr->ic, - uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1), - rtab); if (ir->eDispCorr != edispcNO) { diff --git a/src/gromacs/mdlib/nbnxn_consts.h b/src/gromacs/mdlib/nbnxn_consts.h index d685a453c0..3c5da82932 100644 --- a/src/gromacs/mdlib/nbnxn_consts.h +++ b/src/gromacs/mdlib/nbnxn_consts.h @@ -78,10 +78,6 @@ extern "C" { #define NBNXN_AVOID_SING_R2_INC 1.0e-36 #endif -/* Coulomb force table size chosen such that it fits along the non-bonded - parameters in the texture cache. */ -#define GPU_EWALD_COULOMB_FORCE_TABLE_SIZE 1536 - /* Strides for x/f with xyz and xyzq coordinate (and charge) storage */ #define STRIDE_XYZ 3 diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 1cd9b4aa13..3c2b38bcb2 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -52,7 +52,6 @@ #include "gromacs/gmxlib/cuda_tools/pmalloc_cuda.h" #include "gromacs/gmxlib/gpu_utils/gpu_utils.h" #include "gromacs/legacyheaders/gmx_detect_hardware.h" -#include "gromacs/legacyheaders/tables.h" #include "gromacs/legacyheaders/typedefs.h" #include "gromacs/legacyheaders/types/enums.h" #include "gromacs/legacyheaders/types/force_flags.h" @@ -114,74 +113,65 @@ static void md_print_warn(FILE *fplog, /* Fw. decl. */ static void nbnxn_cuda_clear_e_fshift(gmx_nbnxn_cuda_t *nb); +/* Fw. decl, */ +static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam, + const gmx_device_info_t *dev_info); + /*! Tabulates the Ewald Coulomb force and initializes the size/scale and the table GPU array. If called with an already allocated table, it just re-uploads the table. */ -static void init_ewald_coulomb_force_table(cu_nbparam_t *nbp, - const gmx_device_info_t *dev_info) +static void init_ewald_coulomb_force_table(const interaction_const_t *ic, + cu_nbparam_t *nbp, + const gmx_device_info_t *dev_info) { - float *ftmp, *coul_tab; - int tabsize; - double tabscale; + float *coul_tab; cudaError_t stat; - tabsize = GPU_EWALD_COULOMB_FORCE_TABLE_SIZE; - /* Subtract 2 iso 1 to avoid access out of range due to rounding */ - tabscale = (tabsize - 2) / sqrt(nbp->rcoulomb_sq); - - pmalloc((void**)&ftmp, tabsize*sizeof(*ftmp)); - - table_spline3_fill_ewald_lr(ftmp, NULL, NULL, tabsize, - 1/tabscale, nbp->ewald_beta, v_q_ewald_lr); - - /* If the table pointer == NULL the table is generated the first time => - the array pointer will be saved to nbparam and the texture is bound. - */ - coul_tab = nbp->coulomb_tab; - if (coul_tab == NULL) + if (nbp->coulomb_tab != NULL) { - stat = cudaMalloc((void **)&coul_tab, tabsize*sizeof(*coul_tab)); - CU_RET_ERR(stat, "cudaMalloc failed on coul_tab"); + nbnxn_cuda_free_nbparam_table(nbp, dev_info); + } + + stat = cudaMalloc((void **)&coul_tab, ic->tabq_size*sizeof(*coul_tab)); + CU_RET_ERR(stat, "cudaMalloc failed on coul_tab"); - nbp->coulomb_tab = coul_tab; + nbp->coulomb_tab = coul_tab; #ifdef HAVE_CUDA_TEXOBJ_SUPPORT - /* Only device CC >= 3.0 (Kepler and later) support texture objects */ - if (dev_info->prop.major >= 3) - { - cudaResourceDesc rd; - memset(&rd, 0, sizeof(rd)); - rd.resType = cudaResourceTypeLinear; - rd.res.linear.devPtr = nbp->coulomb_tab; - rd.res.linear.desc.f = cudaChannelFormatKindFloat; - rd.res.linear.desc.x = 32; - rd.res.linear.sizeInBytes = tabsize*sizeof(*coul_tab); + /* Only device CC >= 3.0 (Kepler and later) support texture objects */ + if (dev_info->prop.major >= 3) + { + cudaResourceDesc rd; + memset(&rd, 0, sizeof(rd)); + rd.resType = cudaResourceTypeLinear; + rd.res.linear.devPtr = nbp->coulomb_tab; + rd.res.linear.desc.f = cudaChannelFormatKindFloat; + rd.res.linear.desc.x = 32; + rd.res.linear.sizeInBytes = ic->tabq_size*sizeof(*coul_tab); - cudaTextureDesc td; - memset(&td, 0, sizeof(td)); - td.readMode = cudaReadModeElementType; - stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL); - CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed"); - } - else + cudaTextureDesc td; + memset(&td, 0, sizeof(td)); + td.readMode = cudaReadModeElementType; + stat = cudaCreateTextureObject(&nbp->coulomb_tab_texobj, &rd, &td, NULL); + CU_RET_ERR(stat, "cudaCreateTextureObject on coulomb_tab_texobj failed"); + } + else #endif /* HAVE_CUDA_TEXOBJ_SUPPORT */ - { - GMX_UNUSED_VALUE(dev_info); - cudaChannelFormatDesc cd = cudaCreateChannelDesc(); - stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(), - coul_tab, &cd, tabsize*sizeof(*coul_tab)); - CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed"); - } + { + GMX_UNUSED_VALUE(dev_info); + cudaChannelFormatDesc cd = cudaCreateChannelDesc(); + stat = cudaBindTexture(NULL, &nbnxn_cuda_get_coulomb_tab_texref(), + coul_tab, &cd, + ic->tabq_size*sizeof(*coul_tab)); + CU_RET_ERR(stat, "cudaBindTexture on coulomb_tab_texref failed"); } - cu_copy_H2D(coul_tab, ftmp, tabsize*sizeof(*coul_tab)); - - nbp->coulomb_tab_size = tabsize; - nbp->coulomb_tab_scale = tabscale; + cu_copy_H2D(coul_tab, ic->tabq_coul_F, ic->tabq_size*sizeof(*coul_tab)); - pfree(ftmp); + nbp->coulomb_tab_size = ic->tabq_size; + nbp->coulomb_tab_scale = ic->tabq_scale; } @@ -362,7 +352,7 @@ static void init_nbparam(cu_nbparam_t *nbp, nbp->coulomb_tab = NULL; if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN) { - init_ewald_coulomb_force_table(nbp, dev_info); + init_ewald_coulomb_force_table(ic, nbp, dev_info); } nnbfp = 2*ntypes*ntypes; @@ -448,7 +438,7 @@ void nbnxn_gpu_pme_loadbal_update_param(const nonbonded_verlet_t *nbv, nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw, nb->dev_info); - init_ewald_coulomb_force_table(nb->nbparam, nb->dev_info); + init_ewald_coulomb_force_table(ic, nb->nbparam, nb->dev_info); } /*! Initializes the pair list data structure. */ @@ -923,6 +913,31 @@ void nbnxn_gpu_init_atomdata(gmx_nbnxn_cuda_t *nb, } } +static void nbnxn_cuda_free_nbparam_table(cu_nbparam_t *nbparam, + const gmx_device_info_t *dev_info) +{ + cudaError_t stat; + + if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN) + { +#ifdef HAVE_CUDA_TEXOBJ_SUPPORT + /* Only device CC >= 3.0 (Kepler and later) support texture objects */ + if (dev_info->prop.major >= 3) + { + stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj); + CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed"); + } + else +#endif + { + GMX_UNUSED_VALUE(dev_info); + stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref()); + CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed"); + } + cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size); + } +} + void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) { cudaError_t stat; @@ -950,24 +965,7 @@ void nbnxn_gpu_free(gmx_nbnxn_cuda_t *nb) plist_nl = nb->plist[eintNonlocal]; timers = nb->timers; - if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN) - { - -#ifdef HAVE_CUDA_TEXOBJ_SUPPORT - /* Only device CC >= 3.0 (Kepler and later) support texture objects */ - if (nb->dev_info->prop.major >= 3) - { - stat = cudaDestroyTextureObject(nbparam->coulomb_tab_texobj); - CU_RET_ERR(stat, "cudaDestroyTextureObject on coulomb_tab_texobj failed"); - } - else -#endif - { - stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref()); - CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab_texref failed"); - } - cu_free_buffered(nbparam->coulomb_tab, &nbparam->coulomb_tab_size); - } + nbnxn_cuda_free_nbparam_table(nbparam, nb->dev_info); stat = cudaEventDestroy(nb->nonlocal_done); CU_RET_ERR(stat, "cudaEventDestroy failed on timers->nonlocal_done"); -- 2.22.0