From: Szilard Pall
Date: Tue, 25 Feb 2014 20:39:07 +0000 (+0100)
Subject: Added CUDA LJ-PME nbnxn kernels
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=d72a25ed4b53c22c8cd0d653f1e8b457b96b3c60;p=alexxy%2Fgromacs.git

Added CUDA LJ-PME nbnxn kernels

This change implements CUDA non-bonded kernels for LJ-PME, introduced in
the Verlet scheme with 99029d. The CUDA kernels implement the geometric
as well as the Lorentz-Berthelot (LB) combination rules (unlike the CPU
SIMD kernels), mostly because, even though LJ-PME is very slow with LB,
it is still beneficial to let the user offload the non-bonded
interactions to a GPU and potentially bump up the cut-off to further
reduce the CPU PME load.

Note that, as we now have 120 kernels (6 electrostatics types x 5 VdW
flavors x 4 force/energy/pruning variants) compiled for up to four
different target architectures, the nbnxn_cuda module takes a very long
time to build and can become the bottleneck during compilation. We will
deal with this later.

Change-Id: I819b59a8948da0c8492eac6a43d4a7fb6dc98354
---

diff --git a/src/gromacs/mdlib/forcerec.c b/src/gromacs/mdlib/forcerec.c
index 3784522200..95fbb19bab 100644
--- a/src/gromacs/mdlib/forcerec.c
+++ b/src/gromacs/mdlib/forcerec.c
@@ -1542,17 +1542,7 @@ gmx_bool nbnxn_acceleration_supported(FILE *fplog,
 const t_inputrec *ir,
 gmx_bool bGPU)
 {
- /* TODO: remove these GPU specific restrictions by implementing CUDA kernels */
- if (bGPU)
- {
- if (ir->vdwtype == evdwPME)
- {
- md_print_warn(cr, fplog, "LJ-PME is not yet supported with GPUs, falling back to CPU only\n");
- return FALSE;
- }
- }
-
- if (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB)
+ if (!bGPU && (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB))
 {
 md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with %s, falling back to %s\n",
 bGPU ? "GPUs" : "SIMD kernels",
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c
index ba0b9a5bbc..d780c21331 100644
--- a/src/gromacs/mdlib/nbnxn_atomdata.c
+++ b/src/gromacs/mdlib/nbnxn_atomdata.c
@@ -360,32 +360,39 @@ void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
 }
 }
-/* Stores the LJ parameter data in a format convenient for the SIMD kernels */
-static void set_ljparam_simd_data(nbnxn_atomdata_t *nbat)
+/* Stores the LJ parameter data in a format convenient for different kernels */
+static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD)
 {
 int nt, i, j;
 real c6, c12;
 nt = nbat->ntype;
- /* nbfp_s4 stores two parameters using a stride of 4,
- * because this would suit x86 SIMD single-precision
- * quad-load intrinsics. There's a slight inefficiency in
- * allocating and initializing nbfp_s4 when it might not
- * be used, but introducing the conditional code is not
- * really worth it. */
- nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4));
- for (i = 0; i < nt; i++)
+ if (bSIMD)
 {
- for (j = 0; j < nt; j++)
+ /* nbfp_s4 stores two parameters using a stride of 4,
+ * because this would suit x86 SIMD single-precision
+ * quad-load intrinsics. There's a slight inefficiency in
+ * allocating and initializing nbfp_s4 when it might not
+ * be used, but introducing the conditional code is not
+ * really worth it.
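+ *
+ * As a sketch of the layout built below (names from this file): for
+ * atom-type pair (i,j), slots [(i*nt+j)*4 + 0..3] hold {c6, c12, 0, 0},
+ * so a single aligned quad-load fetches both parameters plus padding.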
*/ + nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4)); + for (i = 0; i < nt; i++) { - nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0]; - nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1]; - nbat->nbfp_s4[(i*nt+j)*4+2] = 0; - nbat->nbfp_s4[(i*nt+j)*4+3] = 0; + for (j = 0; j < nt; j++) + { + nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0]; + nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1]; + nbat->nbfp_s4[(i*nt+j)*4+2] = 0; + nbat->nbfp_s4[(i*nt+j)*4+3] = 0; + } } } + /* We use combination rule data for SIMD combination rule kernels + * and with LJ-PME kernels. We then only need parameters per atom type, + * not per pair of atom types. + */ switch (nbat->comb_rule) { case ljcrGEOM: @@ -393,7 +400,7 @@ static void set_ljparam_simd_data(nbnxn_atomdata_t *nbat) for (i = 0; i < nt; i++) { - /* Copy the diagonal from the nbfp matrix */ + /* Store the sqrt of the diagonal from the nbfp matrix */ nbat->nbfp_comb[i*2 ] = sqrt(nbat->nbfp[(i*nt+i)*2 ]); nbat->nbfp_comb[i*2+1] = sqrt(nbat->nbfp[(i*nt+i)*2+1]); } @@ -527,7 +534,7 @@ void nbnxn_atomdata_init(FILE *fp, int i, j, nth; real c6, c12, tol; char *ptr; - gmx_bool simple, bCombGeom, bCombLB; + gmx_bool simple, bCombGeom, bCombLB, bSIMD; if (alloc == NULL) { @@ -688,10 +695,10 @@ void nbnxn_atomdata_init(FILE *fp, gmx_incons("Unknown enbnxninitcombrule"); } - if (simple) - { - set_ljparam_simd_data(nbat); - } + bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN || + nb_kernel_type == nbnxnk4xN_SIMD_2xNN); + + set_lj_parameter_data(nbat, bSIMD); nbat->natoms = 0; nbat->type = NULL; @@ -700,27 +707,25 @@ void nbnxn_atomdata_init(FILE *fp, { int pack_x; - switch (nb_kernel_type) + if (bSIMD) { - case nbnxnk4xN_SIMD_4xN: - case nbnxnk4xN_SIMD_2xNN: - pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE, - nbnxn_kernel_to_cj_size(nb_kernel_type)); - switch (pack_x) - { - case 4: - nbat->XFormat = nbatX4; - break; - case 8: - nbat->XFormat = nbatX8; - break; - default: - gmx_incons("Unsupported packing width"); - } - break; - default: - nbat->XFormat = nbatXYZ; - break; + pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE, + nbnxn_kernel_to_cj_size(nb_kernel_type)); + switch (pack_x) + { + case 4: + nbat->XFormat = nbatX4; + break; + case 8: + nbat->XFormat = nbatX8; + break; + default: + gmx_incons("Unsupported packing width"); + } + } + else + { + nbat->XFormat = nbatXYZ; } nbat->FFormat = nbat->XFormat; diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 7477ac7863..2821e54cbd 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -62,9 +62,12 @@ #define USE_TEXOBJ #endif -/*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/ +/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */ texture nbfp_texref; +/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */ +texture nbfp_comb_texref; + /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */ texture coulomb_tab_texref; @@ -154,45 +157,45 @@ static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo) /*! Force-only kernel function pointers. 
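 * Tables are indexed as [electrostatics type][VdW type] using the eelCu
 * and evdwCu enum values from nbnxn_cuda_types.h; e.g. the analytical
 * Ewald row combined with the evdwCuEWALDGEOM column resolves to
 * nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda below.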
*/ static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] = { - { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda }, - { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda }, - { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda }, - { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda }, - { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda }, - { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda } + { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda } }; /*! Force + energy kernel function pointers. 
*/ static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] = { - { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda }, - { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda }, - { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda }, - { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda }, - { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda }, - { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda } + { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda } }; /*! Force + pruning kernel function pointers. 
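 * The *_prune flavors additionally prune the pair list while computing
 * forces and are intended for steps on which the pair list has just been
 * (re)built; see the kernel-selection helper referenced below.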
*/ static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] = { - { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda }, - { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda }, - { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda }, - { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda }, - { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda }, - { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda } + { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda } }; /*! Force + energy + pruning kernel function pointers. 
*/ static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] = { - { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda }, - { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda }, - { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda }, - { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda }, - { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda }, - { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda } + { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda } }; /*! Return a pointer to the kernel version to be executed at the current step. */ @@ -683,6 +686,12 @@ const struct texture &nbnxn_cuda_get_nbfp_tex return nbfp_texref; } +/*! Return the reference to the nbfp_comb texture. */ +const struct texture &nbnxn_cuda_get_nbfp_comb_texref() +{ + return nbfp_comb_texref; +} + /*! Return the reference to the coulomb_tab. 
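 * These getters exist because CUDA texture references have
 * translation-unit scope: nbnxn_cuda_data_mgmt.cu declares them extern
 * (see below) and uses them to bind/unbind the textures on devices
 * without texture-object support (CC < 3.0).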
*/ const struct texture &nbnxn_cuda_get_coulomb_tab_texref() { diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 0fe6e8e275..dfdafbb2fd 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -72,6 +72,7 @@ static unsigned int gpu_min_ci_balanced_factor = 40; /* Functions from nbnxn_cuda.cu */ extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo); extern const struct texture &nbnxn_cuda_get_nbfp_texref(); +extern const struct texture &nbnxn_cuda_get_nbfp_comb_texref(); extern const struct texture &nbnxn_cuda_get_coulomb_tab_texref(); /* We should actually be using md_print_warn in md_logging.c, @@ -266,39 +267,62 @@ static void init_nbparam(cu_nbparam_t *nbp, const cuda_dev_info_t *dev_info) { cudaError_t stat; - int ntypes, nnbfp; + int ntypes, nnbfp, nnbfp_comb; ntypes = nbat->ntype; - nbp->ewald_beta = ic->ewaldcoeff_q; - nbp->sh_ewald = ic->sh_ewald; - nbp->epsfac = ic->epsfac; - nbp->two_k_rf = 2.0 * ic->k_rf; - nbp->c_rf = ic->c_rf; - nbp->rvdw_sq = ic->rvdw * ic->rvdw; - nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb; - nbp->rlist_sq = ic->rlist * ic->rlist; + nbp->ewald_beta = ic->ewaldcoeff_q; + nbp->sh_ewald = ic->sh_ewald; + nbp->epsfac = ic->epsfac; + nbp->two_k_rf = 2.0 * ic->k_rf; + nbp->c_rf = ic->c_rf; + nbp->rvdw_sq = ic->rvdw * ic->rvdw; + nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb; + nbp->rlist_sq = ic->rlist * ic->rlist; + + nbp->sh_lj_ewald = ic->sh_lj_ewald; + nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj; nbp->rvdw_switch = ic->rvdw_switch; nbp->dispersion_shift = ic->dispersion_shift; nbp->repulsion_shift = ic->repulsion_shift; nbp->vdw_switch = ic->vdw_switch; - switch (ic->vdw_modifier) + if (ic->vdwtype == evdwCUT) + { + switch (ic->vdw_modifier) + { + case eintmodNONE: + case eintmodPOTSHIFT: + nbp->vdwtype = evdwCuCUT; + break; + case eintmodFORCESWITCH: + nbp->vdwtype = evdwCuFSWITCH; + break; + case eintmodPOTSWITCH: + nbp->vdwtype = evdwCuPSWITCH; + break; + default: + gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!"); + break; + } + } + else if (ic->vdwtype == evdwPME) + { + if (ic->ljpme_comb_rule == ljcrGEOM) + { + assert(nbat->comb_rule == ljcrGEOM); + nbp->vdwtype = evdwCuEWALDGEOM; + } + else + { + assert(nbat->comb_rule == ljcrLB); + nbp->vdwtype = evdwCuEWALDLB; + } + } + else { - case eintmodNONE: - case eintmodPOTSHIFT: - nbp->vdwtype = evdwCuCUT; - break; - case eintmodFORCESWITCH: - nbp->vdwtype = evdwCuFSWITCH; - break; - case eintmodPOTSWITCH: - nbp->vdwtype = evdwCuPSWITCH; - break; - default: - gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!"); - break; + gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!"); } if (ic->eeltype == eelCUT) @@ -327,16 +351,28 @@ static void init_nbparam(cu_nbparam_t *nbp, init_ewald_coulomb_force_table(nbp, dev_info); } - nnbfp = 2*ntypes*ntypes; + nnbfp = 2*ntypes*ntypes; + nnbfp_comb = 2*ntypes; + stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp)); CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp"); cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp)); + + if (ic->vdwtype == evdwPME) + { + stat = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb)); + CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb"); + cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, 
nnbfp_comb*sizeof(*nbp->nbfp_comb)); + } + #ifdef TEXOBJ_SUPPORTED /* Only device CC >= 3.0 (Kepler and later) support texture objects */ if (dev_info->prop.major >= 3) { cudaResourceDesc rd; + cudaTextureDesc td; + memset(&rd, 0, sizeof(rd)); rd.resType = cudaResourceTypeLinear; rd.res.linear.devPtr = nbp->nbfp; @@ -344,11 +380,25 @@ static void init_nbparam(cu_nbparam_t *nbp, rd.res.linear.desc.x = 32; rd.res.linear.sizeInBytes = nnbfp*sizeof(*nbp->nbfp); - cudaTextureDesc td; memset(&td, 0, sizeof(td)); td.readMode = cudaReadModeElementType; stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL); CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed"); + + if (ic->vdwtype == evdwPME) + { + memset(&rd, 0, sizeof(rd)); + rd.resType = cudaResourceTypeLinear; + rd.res.linear.devPtr = nbp->nbfp_comb; + rd.res.linear.desc.f = cudaChannelFormatKindFloat; + rd.res.linear.desc.x = 32; + rd.res.linear.sizeInBytes = nnbfp_comb*sizeof(*nbp->nbfp_comb); + + memset(&td, 0, sizeof(td)); + td.readMode = cudaReadModeElementType; + stat = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL); + CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed"); + } } else #endif @@ -357,6 +407,13 @@ static void init_nbparam(cu_nbparam_t *nbp, stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(), nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp)); CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed"); + + if (ic->vdwtype == evdwPME) + { + stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(), + nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb)); + CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed"); + } } } @@ -942,6 +999,24 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb) } cu_free_buffered(nbparam->nbfp); + if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB) + { +#ifdef TEXOBJ_SUPPORTED + /* Only device CC >= 3.0 (Kepler and later) support texture objects */ + if (cu_nb->dev_info->prop.major >= 3) + { + stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj); + CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed"); + } + else +#endif + { + stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref()); + CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed"); + } + cu_free_buffered(nbparam->nbfp_comb); + } + stat = cudaFree(atdat->shift_vec); CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec"); stat = cudaFree(atdat->fshift); diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index 1ea1828873..2d813ec61a 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -51,6 +51,20 @@ #define EL_EWALD_ANY #endif +#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD +/* Macro to control the calculation of exclusion forces in the kernel + * We do that with Ewald (elec/vdw) and RF. + * + * Note: convenience macro, needs to be undef-ed at the end of the file. + */ +#define EXCLUSION_FORCES +#endif + +#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB +/* Note: convenience macro, needs to be undef-ed at the end of the file. 
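+ *
+ * LJ_EWALD is set whenever either combination-rule flavor is being
+ * compiled, so shared code below can test a single macro instead of
+ * both LJ_EWALD_COMB_GEOM and LJ_EWALD_COMB_LB.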
*/ +#define LJ_EWALD +#endif + /* Kernel launch parameters: - #blocks = #pair lists, blockId = pair list Id @@ -97,6 +111,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) float rvdw_sq = nbparam.rvdw_sq; float vdw_in_range; #endif +#ifdef LJ_EWALD + float lje_coeff2, lje_coeff6_6; +#endif #ifdef EL_RF float two_k_rf = nbparam.two_k_rf; #endif @@ -192,29 +209,56 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) fci_buf[ci_offset] = make_float3(0.0f); } +#ifdef LJ_EWALD + /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */ + lje_coeff2 = nbparam.ewaldcoeff_lj*nbparam.ewaldcoeff_lj; + lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F; +#endif /* LJ_EWALD */ + + #ifdef CALC_ENERGIES E_lj = 0.0f; E_el = 0.0f; -#if defined EL_EWALD_ANY || defined EL_RF +#if defined EXCLUSION_FORCES /* Ewald or RF */ if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL) { - /* we have the diagonal: add the charge self interaction energy term */ + /* we have the diagonal: add the charge and LJ self interaction energy term */ for (i = 0; i < NCL_PER_SUPERCL; i++) { +#if defined EL_EWALD_ANY || defined EL_RF qi = xqib[i * CL_SIZE + tidxi].w; E_el += qi*qi; +#endif + +#if defined LJ_EWALD +#ifdef USE_TEXOBJ + E_lj += tex1Dfetch(nbparam.nbfp_texobj, atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2); +#else + E_lj += tex1Dfetch(nbfp_texref, atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2); +#endif /* USE_TEXOBJ */ +#endif /* LJ_EWALD */ + } - /* divide the self term equally over the j-threads */ + + /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */ +#ifdef LJ_EWALD + E_lj /= CL_SIZE; + E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6; +#endif /* LJ_EWALD */ + +#if defined EL_EWALD_ANY || defined EL_RF E_el /= CL_SIZE; #ifdef EL_RF E_el *= -nbparam.epsfac*0.5f*c_rf; #else E_el *= -nbparam.epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ #endif +#endif /* EL_EWALD_ANY || defined EL_RF */ } -#endif -#endif +#endif /* EXCLUSION_FORCES */ + +#endif /* CALC_ENERGIES */ /* skip central shifts when summing shift forces */ if (nb_sci.shift == CENTRAL) @@ -300,7 +344,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) int_bit = (wexcl & mask_ji) ? 
1.0f : 0.0f; /* cutoff & exclusion check */ -#if defined EL_EWALD_ANY || defined EL_RF +#ifdef EXCLUSION_FORCES if (r2 < rcoulomb_sq * (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi)) #else @@ -331,11 +375,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) inv_r = rsqrt(r2); inv_r2 = inv_r * inv_r; inv_r6 = inv_r2 * inv_r2 * inv_r2; -#if defined EL_EWALD_ANY || defined EL_RF +#if defined EXCLUSION_FORCES /* We could mask inv_r2, but with Ewald * masking both inv_r6 and F_invr is faster */ inv_r6 *= int_bit; -#endif +#endif /* EXCLUSION_FORCES */ F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2; #if defined CALC_ENERGIES || defined LJ_POT_SWITCH @@ -351,6 +395,25 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) #endif /* CALC_ENERGIES */ #endif /* LJ_FORCE_SWITCH */ + +#ifdef LJ_EWALD +#ifdef LJ_EWALD_COMB_GEOM +#ifdef CALC_ENERGIES + calculate_lj_ewald_comb_geom_F_E(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p); +#else + calculate_lj_ewald_comb_geom_F(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr); +#endif /* CALC_ENERGIES */ +#elif defined LJ_EWALD_COMB_LB + calculate_lj_ewald_comb_LB_F_E(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, +#ifdef CALC_ENERGIES + int_bit, &F_invr, &E_lj_p +#else + 0, &F_invr, NULL +#endif /* CALC_ENERGIES */ + ); +#endif /* LJ_EWALD_COMB_GEOM */ +#endif /* LJ_EWALD */ + #ifdef VDW_CUTOFF_CHECK /* Separate VDW cut-off check to enable twin-range cut-offs * (rvdw < rcoulomb <= rlist) @@ -486,3 +549,5 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) } #undef EL_EWALD_ANY +#undef EXCLUSION_FORCES +#undef LJ_EWALD diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh index 00b6d47115..7ea591db30 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh @@ -183,6 +183,123 @@ void calculate_potential_switch_F_E(const cu_nbparam_t nbparam, *E_lj *= sw; } +/*! Calculate LJ-PME grid force contribution with + * geometric combination rule. + */ +static inline __device__ +void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam, + int typei, + int typej, + float r2, + float inv_r2, + float lje_coeff2, + float lje_coeff6_6, + float *F_invr) +{ + float c6grid, inv_r6_nm, cr2, expmcr2, poly; + +#ifdef USE_TEXOBJ + c6grid = tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typej); +#else + c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej); +#endif /* USE_TEXOBJ */ + + /* Recalculate inv_r6 without exclusion mask */ + inv_r6_nm = inv_r2*inv_r2*inv_r2; + cr2 = lje_coeff2*r2; + expmcr2 = expf(-cr2); + poly = 1.0f + cr2 + 0.5f*cr2*cr2; + + /* Subtract the grid force from the total LJ force */ + *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2; +} + +/*! Calculate LJ-PME grid force + energy contribution with + * geometric combination rule. 
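+ *
+ * With x = ewaldcoeff_lj^2 * r^2 and P(x) = 1 + x + x^2/2, the grid
+ * correction applied below is
+ *   F(r)/r = c6grid*(r^-6 - exp(-x)*(P(x)*r^-6 + ewaldcoeff_lj^6/6))*r^-2
+ *   E(r)   = c6grid/6*(r^-6*(1 - exp(-x)*P(x)) + sh_lj_ewald),
+ * where the shift is masked by int_bit so it only applies to
+ * non-excluded pairs; the LB variant below differs only in deriving
+ * c6grid from per-type sigma/epsilon data.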
+ */ +static inline __device__ +void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam, + int typei, + int typej, + float r2, + float inv_r2, + float lje_coeff2, + float lje_coeff6_6, + float int_bit, + float *F_invr, + float *E_lj) +{ + float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask; + +#ifdef USE_TEXOBJ + c6grid = tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typej); +#else + c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej); +#endif /* USE_TEXOBJ */ + + /* Recalculate inv_r6 without exclusion mask */ + inv_r6_nm = inv_r2*inv_r2*inv_r2; + cr2 = lje_coeff2*r2; + expmcr2 = expf(-cr2); + poly = 1.0f + cr2 + 0.5f*cr2*cr2; + + /* Subtract the grid force from the total LJ force */ + *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2; + + /* Shift should be applied only to real LJ pairs */ + sh_mask = nbparam.sh_lj_ewald*int_bit; + *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask); +} + +/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with + * Lorentz-Berthelot combination rule. + * We use a single F+E kernel with conditional because the performance impact + * of this is pretty small and LB on the CPU is anyway very slow. + */ +static inline __device__ +void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam, + int typei, + int typej, + float r2, + float inv_r2, + float lje_coeff2, + float lje_coeff6_6, + float int_bit, + float *F_invr, + float *E_lj) +{ + float c6grid, inv_r6_nm, cr2, expmcr2, poly; + float sigma, sigma2, epsilon; + + /* sigma and epsilon are scaled to give 6*C6 */ +#ifdef USE_TEXOBJ + sigma = tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typei ) + tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typej ); + epsilon = tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typej + 1); +#else + sigma = tex1Dfetch(nbfp_comb_texref, 2*typei ) + tex1Dfetch(nbfp_comb_texref, 2*typej ); + epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1); +#endif /* USE_TEXOBJ */ + sigma2 = sigma*sigma; + c6grid = epsilon*sigma2*sigma2*sigma2; + + /* Recalculate inv_r6 without exclusion mask */ + inv_r6_nm = inv_r2*inv_r2*inv_r2; + cr2 = lje_coeff2*r2; + expmcr2 = expf(-cr2); + poly = 1.0f + cr2 + 0.5f*cr2*cr2; + + /* Subtract the grid force from the total LJ force */ + *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2; + + if (E_lj != NULL) + { + float sh_mask; + + /* Shift should be applied only to real LJ pairs */ + sh_mask = nbparam.sh_lj_ewald*int_bit; + *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask); + } +} /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture. * Original idea: from the OpenMM project diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh index 555ec57325..897b956a74 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh @@ -36,7 +36,8 @@ /*! \internal \file * This header has the sole purpose of generating kernels for the combinations of * supported electrostatics types (cut-off, reaction-field, analytical and - * tabulated Ewald) and VDW types ( V shift, F switch, V swtich). + * tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with + * geometric or Lorentz-Berthelot combination rule, F switch, V switch). 
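+ *
+ * Each combination below is generated by #defining the matching EL_* and
+ * LJ_* macros and re-#including nbnxn_cuda_kernel.cuh, with
+ * NB_KERNEL_FUNC_NAME redefined each time to paste the flavor-specific
+ * kernel name.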
* * The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which * require an extra distance check to enable PP-PME load balancing @@ -49,17 +50,29 @@ */ #define EL_CUTOFF -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJPsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" @@ -68,21 +81,34 @@ #undef EL_CUTOFF + /* Analytical reaction-field kernels */ #define EL_RF -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJPsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" @@ -96,17 +122,29 @@ */ #define EL_EWALD_ANA -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) 
x ## _ElecEw_VdwLJPsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" @@ -116,23 +154,34 @@ #undef EL_EWALD_ANA - /* Analytical Ewald interaction kernels with twin-range cut-off */ #define EL_EWALD_ANA #define LJ_CUTOFF_CHECK -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJPsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" @@ -143,21 +192,32 @@ #undef LJ_CUTOFF_CHECK - /* Tabulated Ewald interaction kernels */ #define EL_EWALD_TAB -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJPsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" @@ -171,17 +231,29 @@ #define EL_EWALD_TAB #define LJ_CUTOFF_CHECK -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) 
x ## _ElecEwQSTabTwinCut_VdwLJPsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index 80d75cba99..9c7192c6b4 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -91,7 +91,7 @@ enum eelCu {
 * should match the order of enumerated types below.
 */
 enum evdwCu {
- evdwCuCUT, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuNR
+ evdwCuCUT, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuEWALDGEOM, evdwCuEWALDLB, evdwCuNR
 };
 /* All structs prefixed with "cu_" hold data used in GPU calculations and
@@ -155,7 +155,10 @@ struct cu_nbparam
 float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */
 float two_k_rf; /**< Reaction-field electrostatics constant */
 float ewald_beta; /**< Ewald/PME parameter */
- float sh_ewald; /**< Ewald/PME correction term */
+ float sh_ewald; /**< Ewald/PME correction term subtracted from the direct-space potential */
+ float sh_lj_ewald; /**< LJ-Ewald/PME correction term added to the correction potential */
+ float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient */
+
 float rcoulomb_sq; /**< Coulomb cut-off squared */
 float rvdw_sq; /**< VdW cut-off squared */
@@ -166,9 +169,11 @@
 shift_consts_t repulsion_shift; /**< VdW shift repulsion constants */
 switch_consts_t vdw_switch; /**< VdW switch constants */
- /* Non-bonded parameters - accessed through texture memory */
- float *nbfp; /**< nonbonded parameter table with C6/C12 pairs */
- cudaTextureObject_t nbfp_texobj; /**< texture object bound to nbfp */
+ /* LJ non-bonded parameters - accessed through texture memory */
+ float *nbfp; /**< nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements */
+ cudaTextureObject_t nbfp_texobj; /**< texture object bound to nbfp */
+ float *nbfp_comb; /**< nonbonded parameter table per atom type, 2*ntype elements */
+ cudaTextureObject_t nbfp_comb_texobj; /**< texture object bound to nbfp_comb */
 /* Ewald Coulomb force table data - accessed through texture memory */
 int coulomb_tab_size; /**< table size (s.t. it fits in texture cache) */
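---

For orientation, a minimal sketch of how the four kernel tables in
nbnxn_cuda.cu are indexed. The helper name and the two flag parameters
are illustrative assumptions; the actual selection logic lives in the
kernel-selection routine referenced above ("Return a pointer to the
kernel version to be executed at the current step").

/* Sketch only: pick one of the 6*5*4 = 120 kernel flavors.
 * bCalcEner/bDoPrune are assumed flag names, not the real signature. */
static nbnxn_cu_kfunc_ptr_t select_nb_kernel_sketch(int      eeltype,   /* eelCu* electrostatics type */
                                                    int      evdwtype,  /* evdwCu* VdW type */
                                                    gmx_bool bCalcEner,
                                                    gmx_bool bDoPrune)
{
    if (bDoPrune)
    {
        return bCalcEner ? nb_kfunc_ener_prune_ptr[eeltype][evdwtype] :
               nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
    }
    else
    {
        return bCalcEner ? nb_kfunc_ener_noprune_ptr[eeltype][evdwtype] :
               nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
    }
}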