From a8cf8c39f389d529696015272a21cf24acc5bc15 Mon Sep 17 00:00:00 2001 From: Szilard Pall Date: Fri, 10 Jan 2014 12:46:15 +0100 Subject: [PATCH] CUDA kernels for switched LJ This change adds CUDA GPU kernels for force and potential switching, implemented in the Verlet scheme for CPUs in f3acaf. Change-Id: I73a83d352924379d9005104fa59f3478d3e684b0 --- src/gromacs/mdlib/forcerec.c | 6 +- src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu | 132 ++++++++++++----- .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 27 +++- .../mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 47 ++++-- .../nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh | 135 +++++++++++++++++- .../mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh | 127 ++++++++++++++-- .../mdlib/nbnxn_cuda/nbnxn_cuda_types.h | 103 +++++++++---- 7 files changed, 474 insertions(+), 103 deletions(-) diff --git a/src/gromacs/mdlib/forcerec.c b/src/gromacs/mdlib/forcerec.c index 954c6a714a..3784522200 100644 --- a/src/gromacs/mdlib/forcerec.c +++ b/src/gromacs/mdlib/forcerec.c @@ -1545,11 +1545,9 @@ gmx_bool nbnxn_acceleration_supported(FILE *fplog, /* TODO: remove these GPU specific restrictions by implementing CUDA kernels */ if (bGPU) { - if (ir->vdw_modifier == eintmodFORCESWITCH || - ir->vdw_modifier == eintmodPOTSWITCH || - ir->vdwtype == evdwPME) + if (ir->vdwtype == evdwPME) { - md_print_warn(cr, fplog, "LJ switch functions and LJ-PME are not yet supported on the GPU, falling back to CPU only\n"); + md_print_warn(cr, fplog, "LJ-PME is not yet supported with GPUs, falling back to CPU only\n"); return FALSE; } } diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 134b43e3a9..7477ac7863 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -143,40 +143,93 @@ static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo) /* Constant arrays listing all kernel function pointers and enabling selection of a kernel in an elegant manner. */ -static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */ -static const int nPruneKernelTypes = 2; /* 0 - no prune, 1 - prune */ - -/*! Pointers to the default kernels organized in a 3 dim array by: - * electrostatics type, energy calculation on/off, and pruning on/off. +/*! Pointers to the non-bonded kernels organized in 2-dim arrays by: + * electrostatics and VDW type. * - * Note that the order of electrostatics (1st dimension) has to match the - * order of corresponding enumerated types defined in nbnxn_cuda_types.h. + * Note that the row- and column-order of function pointers has to match the + * order of corresponding enumerated electrostatics and vdw types, resp., + * defined in nbnxn_cuda_types.h. */ -static const nbnxn_cu_kfunc_ptr_t - nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] = + +/*! Force-only kernel function pointers. 
*/ +static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] = +{ + { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda } +}; + +/*! Force + energy kernel function pointers. */ +static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] = +{ + { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda } +}; + +/*! Force + pruning kernel function pointers. 
*/ +static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] = { - { { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda } }, - { { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda } }, - { { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda } }, - { { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda } }, - { { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda } }, - { { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda } } + { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda } +}; + +/*! Force + energy + pruning kernel function pointers. */ +static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] = +{ + { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda } }; /*! Return a pointer to the kernel version to be executed at the current step. 
*/
 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int  eeltype,
+                                                       int  evdwtype,
                                                        bool bDoEne,
                                                        bool bDoPrune)
 {
+    nbnxn_cu_kfunc_ptr_t res;
+
+    assert(eeltype < eelCuNR);
+    assert(evdwtype < evdwCuNR);
 
-    return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
+    if (bDoEne)
+    {
+        if (bDoPrune)
+        {
+            res = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
+        }
+        else
+        {
+            res = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
+        }
+    }
+    else
+    {
+        if (bDoPrune)
+        {
+            res = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
+        }
+        else
+        {
+            res = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
+        }
+    }
+
+    return res;
 }
 
 /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
@@ -302,7 +355,9 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
     }
 
     /* get the pointer to the kernel flavor we need to use */
-    nb_kernel = select_nbnxn_kernel(nbp->eeltype, bCalcEner,
+    nb_kernel = select_nbnxn_kernel(nbp->eeltype,
+                                    nbp->vdwtype,
+                                    bCalcEner,
                                     plist->bDoPrune || always_prune);
 
     /* kernel launch config */
@@ -642,23 +697,26 @@ void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 
     for (int i = 0; i < eelCuNR; i++)
     {
-        for (int j = 0; j < nEnergyKernelTypes; j++)
+        for (int j = 0; j < evdwCuNR; j++)
         {
-            for (int k = 0; k < nPruneKernelTypes; k++)
+            if (devinfo->prop.major >= 3)
+            {
+                /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
+                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+            }
+            else
             {
-                if (devinfo->prop.major >= 3)
-                {
-                    /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
-                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
-                }
-                else
-                {
-                    /* On Fermi prefer L1 gives 2% higher performance */
-                    /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
-                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
-                }
-                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+                /* On Fermi, preferring L1 gives 2% higher performance */
+                /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
+                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
             }
+            CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
         }
     }
 }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index cae773528a..0fe6e8e275 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -46,6 +46,7 @@
 #include "smalloc.h"
 #include "tables.h"
 #include "typedefs.h"
+#include "types/enums.h"
 #include "types/nb_verlet.h"
 #include "types/interaction_const.h"
 #include "types/force_flags.h"
@@ -277,13 +278,27 @@ static void init_nbparam(cu_nbparam_t *nbp,
     nbp->rvdw_sq     = ic->rvdw * ic->rvdw;
     nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
     nbp->rlist_sq    = ic->rlist * ic->rlist;
-    nbp->sh_invrc6   = ic->sh_invrc6;
 
-    /* TODO: implemented LJ force- and potential-switch CUDA kernels */
-    if (!(ic->vdw_modifier == eintmodNONE ||
-          ic->vdw_modifier == 
eintmodPOTSHIFT)) - { - gmx_fatal(FARGS, "The CUDA kernels do not yet support switched LJ interactions"); + nbp->rvdw_switch = ic->rvdw_switch; + nbp->dispersion_shift = ic->dispersion_shift; + nbp->repulsion_shift = ic->repulsion_shift; + nbp->vdw_switch = ic->vdw_switch; + + switch (ic->vdw_modifier) + { + case eintmodNONE: + case eintmodPOTSHIFT: + nbp->vdwtype = evdwCuCUT; + break; + case eintmodFORCESWITCH: + nbp->vdwtype = evdwCuFSWITCH; + break; + case eintmodPOTSWITCH: + nbp->vdwtype = evdwCuPSWITCH; + break; + default: + gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!"); + break; } if (ic->eeltype == eelCUT) diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index bc3dd5f43c..1ea1828873 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -112,16 +112,15 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) #endif #ifdef CALC_ENERGIES - float lj_shift = nbparam.sh_invrc6; #ifdef EL_EWALD_ANY float beta = nbparam.ewald_beta; float ewald_shift = nbparam.sh_ewald; #else float c_rf = nbparam.c_rf; -#endif - float *e_lj = atdat.e_lj; - float *e_el = atdat.e_el; -#endif +#endif /* EL_EWALD_ANY */ + float *e_lj = atdat.e_lj; + float *e_el = atdat.e_el; +#endif /* CALC_ENERGIES */ /* thread/block/warp id-s */ unsigned int tidxi = threadIdx.x; @@ -141,7 +140,10 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) int_bit, F_invr; #ifdef CALC_ENERGIES - float E_lj, E_el, E_lj_p; + float E_lj, E_el; +#endif +#if defined CALC_ENERGIES || defined LJ_POT_SWITCH + float E_lj_p; #endif unsigned int wexcl, imask, mask_ji; float4 xqbuf; @@ -336,19 +338,38 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) #endif F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2; +#if defined CALC_ENERGIES || defined LJ_POT_SWITCH + E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam.repulsion_shift.cpot)*ONE_TWELVETH_F - + c6 * (inv_r6 + nbparam.dispersion_shift.cpot)*ONE_SIXTH_F); +#endif +#ifdef LJ_FORCE_SWITCH #ifdef CALC_ENERGIES - E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f); -#endif + calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p); +#else + calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr); +#endif /* CALC_ENERGIES */ +#endif /* LJ_FORCE_SWITCH */ #ifdef VDW_CUTOFF_CHECK - /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */ - vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f; - F_invr *= vdw_in_range; + /* Separate VDW cut-off check to enable twin-range cut-offs + * (rvdw < rcoulomb <= rlist) + */ + vdw_in_range = (r2 < rvdw_sq) ? 
1.0f : 0.0f;
+                        F_invr  *= vdw_in_range;
 #ifdef CALC_ENERGIES
-                        E_lj_p *= vdw_in_range;
-#endif
+                        E_lj_p  *= vdw_in_range;
 #endif
+#endif                  /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+                        calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                        calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
 #ifdef CALC_ENERGIES
                         E_lj    += E_lj_p;
 #endif
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
index bf7c8cfacf..00b6d47115 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -48,9 +48,142 @@
 #define CL_SIZE_SQ                 (CL_SIZE * CL_SIZE)
 #define FBUF_STRIDE                (CL_SIZE_SQ)
 
+#define ONE_SIXTH_F     0.16666667f
+#define ONE_TWELVETH_F  0.08333333f
+
+
 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
 const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
 
+/*! Apply force switch, force-only version. */
+static inline __device__
+void calculate_force_switch_F(const cu_nbparam_t nbparam,
+                              float              c6,
+                              float              c12,
+                              float              inv_r,
+                              float              r2,
+                              float             *F_invr)
+{
+    float r, r_switch;
+
+    /* force switch constants */
+    float disp_shift_V2 = nbparam.dispersion_shift.c2;
+    float disp_shift_V3 = nbparam.dispersion_shift.c3;
+    float repu_shift_V2 = nbparam.repulsion_shift.c2;
+    float repu_shift_V3 = nbparam.repulsion_shift.c3;
+
+    r         = r2 * inv_r;
+    r_switch  = r - nbparam.rvdw_switch;
+    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    *F_invr  +=
+        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+}
+
+/*! Apply force switch, force + energy version. */
+static inline __device__
+void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
+                                float              c6,
+                                float              c12,
+                                float              inv_r,
+                                float              r2,
+                                float             *F_invr,
+                                float             *E_lj)
+{
+    float r, r_switch;
+
+    /* force switch constants */
+    float disp_shift_V2 = nbparam.dispersion_shift.c2;
+    float disp_shift_V3 = nbparam.dispersion_shift.c3;
+    float repu_shift_V2 = nbparam.repulsion_shift.c2;
+    float repu_shift_V3 = nbparam.repulsion_shift.c3;
+
+    float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
+    float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
+    float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
+    float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
+
+    r         = r2 * inv_r;
+    r_switch  = r - nbparam.rvdw_switch;
+    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    *F_invr  +=
+        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+    *E_lj    +=
+        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+}
+
+/*! Apply potential switch, force-only version. 
*/
+static inline __device__
+void calculate_potential_switch_F(const cu_nbparam_t nbparam,
+                                  float              c6,
+                                  float              c12,
+                                  float              inv_r,
+                                  float              r2,
+                                  float             *F_invr,
+                                  float             *E_lj)
+{
+    float r, r_switch;
+    float sw, dsw;
+
+    /* potential switch constants */
+    float switch_V3 = nbparam.vdw_switch.c3;
+    float switch_V4 = nbparam.vdw_switch.c4;
+    float switch_V5 = nbparam.vdw_switch.c5;
+    float switch_F2 = 3*nbparam.vdw_switch.c3;
+    float switch_F3 = 4*nbparam.vdw_switch.c4;
+    float switch_F4 = 5*nbparam.vdw_switch.c5;
+
+    r        = r2 * inv_r;
+    r_switch = r - nbparam.rvdw_switch;
+
+    /* Unlike in the F+E kernel, conditional is faster here */
+    if (r_switch > 0.0f)
+    {
+        sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+        dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+    }
+}
+
+/*! Apply potential switch, force + energy version. */
+static inline __device__
+void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
+                                    float              c6,
+                                    float              c12,
+                                    float              inv_r,
+                                    float              r2,
+                                    float             *F_invr,
+                                    float             *E_lj)
+{
+    float r, r_switch;
+    float sw, dsw;
+
+    /* potential switch constants */
+    float switch_V3 = nbparam.vdw_switch.c3;
+    float switch_V4 = nbparam.vdw_switch.c4;
+    float switch_V5 = nbparam.vdw_switch.c5;
+    float switch_F2 = 3*nbparam.vdw_switch.c3;
+    float switch_F3 = 4*nbparam.vdw_switch.c4;
+    float switch_F4 = 5*nbparam.vdw_switch.c5;
+
+    r        = r2 * inv_r;
+    r_switch = r - nbparam.rvdw_switch;
+    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    /* Unlike in the F-only kernel, masking is faster here */
+    sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+    dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+    *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+    *E_lj  *= sw;
+}
+
+
 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
  *  Original idea: from the OpenMM project
  */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
index 8f8c53efe7..555ec57325 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@ -34,8 +34,9 @@
  */
 
 /*! \internal \file
- * This header has the sole purpose of generating kernels for the supported
- * electrostatics types: cut-off, reaction-field, Ewald, and tabulated Ewald.
+ * This header has the sole purpose of generating kernels for the combinations of
+ * supported electrostatics types (cut-off, reaction-field, analytical and
+ * tabulated Ewald) and VDW types (V shift, F switch, V switch).
  *
  * The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
  * require an extra distance check to enable PP-PME load balancing
 *
 * NOTE: No include fence as it is meant to be included multiple times.
 */
 
-/* Analytical plain cut-off kernels */
+/* Analytical plain cut-off electrostatics kernels
+ */
 #define EL_CUTOFF
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_CUTOFF
+#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) 
x ## _ElecCut_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME
 
-/* Analytical reaction-field kernels */
+#undef EL_CUTOFF
+
+/* Analytical reaction-field kernels
+ */
 #define EL_RF
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_RF
 #undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_RF
+
 /* Analytical Ewald interaction kernels
  */
 #define EL_EWALD_ANA
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_ANA
 #undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_ANA
+
+
 /* Analytical Ewald interaction kernels with twin-range cut-off
  */
 #define EL_EWALD_ANA
 #define VDW_CUTOFF_CHECK
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_ANA
-#undef VDW_CUTOFF_CHECK
+#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME
+#undef EL_EWALD_ANA
+#undef VDW_CUTOFF_CHECK
+
+
+
 /* Tabulated Ewald interaction kernels */
 #define EL_EWALD_TAB
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_TAB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME
+#undef EL_EWALD_TAB
+
+
 /* Tabulated Ewald interaction kernels with twin-range cut-off */
 #define EL_EWALD_TAB
 #define VDW_CUTOFF_CHECK
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_TAB
-#undef VDW_CUTOFF_CHECK
 #undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) 
x ## _ElecEwQSTabTwinCut_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_TAB
+#undef VDW_CUTOFF_CHECK
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index f8d7a8d076..80d75cba99 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2012, The GROMACS development team.
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -35,9 +35,18 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
+/*! \internal \file
+ * \brief
+ * Data types used internally in the nbnxn_cuda module.
+ *
+ * \author Szilárd Páll
+ * \ingroup module_mdlib
+ */
+
 #ifndef NBNXN_CUDA_TYPES_H
 #define NBNXN_CUDA_TYPES_H
 
+#include "types/interaction_const.h"
 #include "types/nbnxn_pairlist.h"
 #include "types/nbnxn_cuda_types_ext.h"
 #include "../../gmxlib/cuda_tools/cudautils.cuh"
@@ -46,7 +55,7 @@
 #if CUDA_VERSION >= 5000
 #define TEXOBJ_SUPPORTED
 #else  /* CUDA_VERSION */
-/* This typedef allows us to define only one version of struct cu_nbparam */
+/*! This typedef allows us to define only one version of struct cu_nbparam */
 typedef int cudaTextureObject_t;
 #endif /* CUDA_VERSION */
 
@@ -54,7 +63,9 @@ typedef int cudaTextureObject_t;
 extern "C" {
 #endif
 
-/** Types of electrostatics implementations available in the CUDA non-bonded
+/*! \brief Electrostatic CUDA kernel flavors.
+ *
+ * Types of electrostatics implementations available in the CUDA non-bonded
  * force kernels. These represent both the electrostatics types implemented
  * by the kernels (cut-off, RF, and Ewald - a subset of what's defined in
  * enums.h) as well as encode implementation details analytical/tabulated
 *
 * Note that the cut-off and RF kernels have only analytical flavor and unlike
 * in the CPU kernels, the tabulated kernels are ATM Ewald-only.
 *
- * The order of pointers to different electrostatic kernels defined in
- * nbnxn_cuda.cu by the nb_default_kfunc_ptr array
- * should match the order of enumerated types below. */
-enum {
+ * The row-order of pointers to different electrostatic kernels defined in
+ * nbnxn_cuda.cu by the nb_*_kfunc_ptr function pointer table
+ * should match the order of enumerated types below.
+ */
+enum eelCu {
     eelCuCUT, eelCuRF, eelCuEWALD_TAB, eelCuEWALD_TAB_TWIN, eelCuEWALD_ANA,
     eelCuEWALD_ANA_TWIN, eelCuNR
 };
 
+/*! \brief VdW CUDA kernel flavors.
+ *
+ * The enumerated values correspond to the LJ implementations in the CUDA non-bonded
+ * kernels.
+ *
+ * The column-order of pointers to the different VdW kernel flavors defined in
+ * nbnxn_cuda.cu by the nb_*_kfunc_ptr function pointer tables
+ * should match the order of enumerated types below. 
+ */ +enum evdwCu { + evdwCuCUT, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuNR +}; + /* All structs prefixed with "cu_" hold data used in GPU calculations and * are passed to the kernels, except cu_timers_t. */ +/*! \cond */ typedef struct cu_plist cu_plist_t; typedef struct cu_atomdata cu_atomdata_t; typedef struct cu_nbparam cu_nbparam_t; typedef struct cu_timers cu_timers_t; typedef struct nb_staging nb_staging_t; +/*! \endcond */ -/** Staging area for temporary data. The energies get downloaded here first, - * before getting added to the CPU-side aggregate values. +/** \internal + * \brief Staging area for temporary data downloaded from the GPU. + * + * The energies/shift forces get downloaded here first, before getting added + * to the CPU-side aggregate values. */ struct nb_staging { @@ -88,7 +118,9 @@ struct nb_staging float3 *fshift; /**< shift forces */ }; -/** Nonbonded atom data -- both inputs and outputs. */ +/** \internal + * \brief Nonbonded atom data - both inputs and outputs. + */ struct cu_atomdata { int natoms; /**< number of atoms */ @@ -97,9 +129,9 @@ struct cu_atomdata float4 *xq; /**< atom coordinates + charges, size natoms */ float3 *f; /**< force output array, size natoms */ - /* TODO: try float2 for the energies */ - float *e_lj, /**< LJ energy output, size 1 */ - *e_el; /**< Electrostatics energy input, size 1 */ + + float *e_lj; /**< LJ energy output, size 1 */ + float *e_el; /**< Electrostatics energy input, size 1 */ float3 *fshift; /**< shift forces */ @@ -110,20 +142,29 @@ struct cu_atomdata bool bShiftVecUploaded; /**< true if the shift vector has been uploaded */ }; -/** Parameters required for the CUDA nonbonded calculations. */ +/** \internal + * \brief Parameters required for the CUDA nonbonded calculations. + */ struct cu_nbparam { - int eeltype; /**< type of electrostatics */ - - float epsfac; /**< charge multiplication factor */ - float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */ - float two_k_rf; /**< Reaction-field electrostatics constant */ - float ewald_beta; /**< Ewald/PME parameter */ - float sh_ewald; /**< Ewald/PME correction term */ - float rvdw_sq; /**< VdW cut-off */ - float rcoulomb_sq; /**< Coulomb cut-off */ - float rlist_sq; /**< pair-list cut-off */ - float sh_invrc6; /**< LJ potential correction term */ + + int eeltype; /**< type of electrostatics, takes values from #eelCu */ + int vdwtype; /**< type of VdW impl., takes values from #evdwCu */ + + float epsfac; /**< charge multiplication factor */ + float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */ + float two_k_rf; /**< Reaction-field electrostatics constant */ + float ewald_beta; /**< Ewald/PME parameter */ + float sh_ewald; /**< Ewald/PME correction term */ + float rcoulomb_sq; /**< Coulomb cut-off squared */ + + float rvdw_sq; /**< VdW cut-off squared */ + float rvdw_switch; /**< VdW switched cut-off */ + float rlist_sq; /**< pair-list cut-off squared */ + + shift_consts_t dispersion_shift; /**< VdW shift dispersion constants */ + shift_consts_t repulsion_shift; /**< VdW shift repulsion constants */ + switch_consts_t vdw_switch; /**< VdW switch constants */ /* Non-bonded parameters - accessed through texture memory */ float *nbfp; /**< nonbonded parameter table with C6/C12 pairs */ @@ -136,7 +177,9 @@ struct cu_nbparam cudaTextureObject_t coulomb_tab_texobj; /**< texture object bound to coulomb_tab */ }; -/** Pair list data */ +/** \internal + * \brief Pair list data. 
+ */ struct cu_plist { int na_c; /**< number of atoms per cluster */ @@ -157,7 +200,9 @@ struct cu_plist done during the current step */ }; -/** CUDA events used for timing GPU kernels and H2D/D2H transfers. +/** \internal + * \brief CUDA events used for timing GPU kernels and H2D/D2H transfers. + * * The two-sized arrays hold the local and non-local values and should always * be indexed with eintLocal/eintNonlocal. */ @@ -175,7 +220,9 @@ struct cu_timers cudaEvent_t stop_nb_k[2]; /**< stop event non-bonded kernels (l/nl, every step) */ }; -/** Main data structure for CUDA nonbonded force calculations. */ +/** \internal + * \brief Main data structure for CUDA nonbonded force calculations. + */ struct nbnxn_cuda { cuda_dev_info_t *dev_info; /**< CUDA device information */ -- 2.22.0
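The launch path in nbnxn_cuda.cu picks a kernel flavor from the 2-dim tables by electrostatics type, VdW type, and the energy/pruning flags. A minimal host-side C sketch of that table-plus-flags lookup is shown below; the enum values, the dummy kernel functions, and pick_kernel() are illustrative stand-ins, not names from the patch.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

typedef void (*kernel_t)(void);

/* Dummy stand-ins for the generated CUDA kernels (illustrative only). */
static void cut_lj(void)  { puts("ElecCut_VdwLJ");    }
static void cut_fsw(void) { puts("ElecCut_VdwLJFsw"); }
static void cut_psw(void) { puts("ElecCut_VdwLJPsw"); }
static void rf_lj(void)   { puts("ElecRF_VdwLJ");     }
static void rf_fsw(void)  { puts("ElecRF_VdwLJFsw");  }
static void rf_psw(void)  { puts("ElecRF_VdwLJPsw");  }

enum { eelCut, eelRF, eelNR };                       /* rows:    electrostatics */
enum { evdwCut, evdwFSwitch, evdwPSwitch, evdwNR };  /* columns: VdW treatment  */

/* One 2-dim table per (energy, prune) combination, as in nbnxn_cuda.cu;
 * only the force-only/no-prune table is populated in this sketch. */
static const kernel_t noener_noprune[eelNR][evdwNR] = {
    { cut_lj, cut_fsw, cut_psw },
    { rf_lj,  rf_fsw,  rf_psw  },
};

static kernel_t pick_kernel(int eeltype, int evdwtype, bool bDoEne, bool bDoPrune)
{
    assert(eeltype < eelNR && evdwtype < evdwNR);
    (void)bDoEne; (void)bDoPrune; /* a full version would select among four tables */
    return noener_noprune[eeltype][evdwtype];
}

int main(void)
{
    pick_kernel(eelRF, evdwPSwitch, false, false)(); /* prints ElecRF_VdwLJPsw */
    return 0;
}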
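For checking the force-switch math outside a GPU build, the correction applied by calculate_force_switch_F()/calculate_force_switch_F_E() can be mirrored in plain C. The sketch below is such a mirror and not part of the patch: fswitch_ref() and its argument names are made up, and the c2/c3 shift constants are passed in directly (in the kernels they arrive in cu_nbparam_t via interaction_const_t).

#include <stdio.h>

/* Host-side mirror of the force-switch correction (illustrative only).
 * With x = r - rvdw_switch clamped at 0, each r^-p term contributes
 * (c2 + c3*x)*x^2/r to F/r (scaled by -c6 or +c12, as in the kernel) and
 * (c2/3 + c3/4*x)*x^3 to the energy with the opposite sign, i.e. minus the
 * integral of the force term, so F = -dV/dr still holds. */
static void fswitch_ref(float c6, float c12,
                        float disp_c2, float disp_c3,
                        float repu_c2, float repu_c3,
                        float rvdw_switch, float r,
                        float *F_invr, float *E_lj)
{
    float inv_r    = 1.0f/r;
    float r_switch = r - rvdw_switch;

    r_switch = (r_switch >= 0.0f) ? r_switch : 0.0f;

    *F_invr +=
        -c6 *(disp_c2 + disp_c3*r_switch)*r_switch*r_switch*inv_r +
         c12*(repu_c2 + repu_c3*r_switch)*r_switch*r_switch*inv_r;
    *E_lj   +=
         c6 *(disp_c2/3 + disp_c3/4*r_switch)*r_switch*r_switch*r_switch -
         c12*(repu_c2/3 + repu_c3/4*r_switch)*r_switch*r_switch*r_switch;
}

int main(void)
{
    /* made-up inputs, only to show the call; real c2/c3 come from the CPU setup */
    float F_invr = 0.0f, E_lj = 0.0f;

    fswitch_ref(1.0f, 1.0f, 5.0f, -20.0f, 5.0f, -20.0f, 0.9f, 0.95f, &F_invr, &E_lj);
    printf("dF/r = %g  dV = %g\n", F_invr, E_lj);
    return 0;
}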
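The potential-switch kernels scale both the LJ energy and force by the quintic polynomial evaluated in calculate_potential_switch_F()/calculate_potential_switch_F_E(). The standalone C check below prints that polynomial across the switching region; it assumes the standard quintic coefficients c3 = -10/d^3, c4 = 15/d^4, c5 = -6/d^5 (with d = rvdw - rvdw_switch), which is how the CPU side is expected to fill switch_consts_t. If the actual constants differ, only those three assignments need changing; the sw/dsw expressions are the same as in the kernel.

#include <stdio.h>

int main(void)
{
    const float rsw = 0.9f, rc = 1.0f;   /* rvdw_switch and rvdw, in nm */
    const float d   = rc - rsw;

    /* assumed quintic switch constants, S(x) = 1 + c3*x^3 + c4*x^4 + c5*x^5 */
    const float c3 = -10.0f/(d*d*d);
    const float c4 =  15.0f/(d*d*d*d);
    const float c5 =  -6.0f/(d*d*d*d*d);

    for (float r = rsw; r <= rc + 1e-6f; r += 0.025f)
    {
        float x   = r - rsw;
        /* same expressions as in the kernels */
        float sw  = 1.0f + (c3 + (c4 + c5*x)*x)*x*x*x;
        float dsw = (3*c3 + (4*c4 + 5*c5*x)*x)*x*x;

        printf("r=%.3f  S=%8.5f  dS/dr=%8.3f\n", r, sw, dsw);
    }
    /* S drops smoothly from 1 at rvdw_switch to 0 at rvdw, with dS/dr = 0 at both
     * ends, which is what keeps the switched potential and force continuous. */
    return 0;
}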