/* TODO: remove these GPU specific restrictions by implementing CUDA kernels */
if (bGPU)
{
- if (ir->vdw_modifier == eintmodFORCESWITCH ||
- ir->vdw_modifier == eintmodPOTSWITCH ||
- ir->vdwtype == evdwPME)
+ if (ir->vdwtype == evdwPME)
{
- md_print_warn(cr, fplog, "LJ switch functions and LJ-PME are not yet supported on the GPU, falling back to CPU only\n");
+ md_print_warn(cr, fplog, "LJ-PME is not yet supported with GPUs, falling back to CPU only\n");
return FALSE;
}
}
/* Constant arrays listing all kernel function pointers and enabling selection
of a kernel in an elegant manner. */
-static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
-static const int nPruneKernelTypes = 2; /* 0 - no prune, 1 - prune */
-
-/*! Pointers to the default kernels organized in a 3 dim array by:
- * electrostatics type, energy calculation on/off, and pruning on/off.
+/*! Pointers to the non-bonded kernels organized in 2-dim arrays by:
+ * electrostatics and VDW type.
*
- * Note that the order of electrostatics (1st dimension) has to match the
- * order of corresponding enumerated types defined in nbnxn_cuda_types.h.
+ * Note that the row- and column-order of function pointers has to match the
+ * order of corresponding enumerated electrostatics and vdw types, resp.,
+ * defined in nbnxn_cuda_types.h.
*/
-static const nbnxn_cu_kfunc_ptr_t
- nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
+
+/*! Force-only kernel function pointers. */
+static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] =
+{
+ { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda },
+ { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda },
+ { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda },
+ { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda },
+ { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda },
+ { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda }
+};
+
+/*! Force + energy kernel function pointers. */
+static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] =
+{
+ { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda },
+ { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda },
+ { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda },
+ { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda },
+ { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda },
+ { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda }
+};
+
+/*! Force + pruning kernel function pointers. */
+static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] =
{
- { { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda },
- { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda } },
- { { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda },
- { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda } },
- { { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda },
- { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda } },
- { { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda },
- { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda } },
- { { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda },
- { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda } },
- { { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda },
- { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda } }
+ { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda },
+ { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda },
+ { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda },
+ { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda },
+ { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda },
+ { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda }
+};
+
+/*! Force + energy + pruning kernel function pointers. */
+static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] =
+{
+ { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda },
+ { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda },
+ { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda },
+ { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda },
+ { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda },
+ { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda }
};
/*! Return a pointer to the kernel version to be executed at the current step. */
static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int eeltype,
+ int evdwtype,
bool bDoEne,
bool bDoPrune)
{
+ nbnxn_cu_kfunc_ptr_t res;
+
assert(eeltype < eelCuNR);
+ assert(evdwtype < eelCuNR);
- return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
+ if (bDoEne)
+ {
+ if (bDoPrune)
+ {
+ res = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
+ }
+ else
+ {
+ res = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
+ }
+ }
+ else
+ {
+ if (bDoPrune)
+ {
+ res = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
+ }
+ else
+ {
+ res = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
+ }
+ }
+
+ return res;
}
/*! Calculates the amount of shared memory required by the CUDA kernel in use. */
}
/* get the pointer to the kernel flavor we need to use */
- nb_kernel = select_nbnxn_kernel(nbp->eeltype, bCalcEner,
+ nb_kernel = select_nbnxn_kernel(nbp->eeltype,
+ nbp->vdwtype,
+ bCalcEner,
plist->bDoPrune || always_prune);
/* kernel launch config */
for (int i = 0; i < eelCuNR; i++)
{
- for (int j = 0; j < nEnergyKernelTypes; j++)
+ for (int j = 0; j < evdwCuNR; j++)
{
- for (int k = 0; k < nPruneKernelTypes; k++)
+ if (devinfo->prop.major >= 3)
+ {
+ /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
+ stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+ stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
+ stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+ }
+ else
{
- if (devinfo->prop.major >= 3)
- {
- /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
- }
- else
- {
- /* On Fermi prefer L1 gives 2% higher performance */
- /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
- }
- CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+ /* On Fermi prefer L1 gives 2% higher performance */
+ /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
+ stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
+ stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
+ stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
}
+ CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
}
}
}
#include "smalloc.h"
#include "tables.h"
#include "typedefs.h"
+#include "types/enums.h"
#include "types/nb_verlet.h"
#include "types/interaction_const.h"
#include "types/force_flags.h"
nbp->rvdw_sq = ic->rvdw * ic->rvdw;
nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
nbp->rlist_sq = ic->rlist * ic->rlist;
- nbp->sh_invrc6 = ic->sh_invrc6;
- /* TODO: implemented LJ force- and potential-switch CUDA kernels */
- if (!(ic->vdw_modifier == eintmodNONE ||
- ic->vdw_modifier == eintmodPOTSHIFT))
- {
- gmx_fatal(FARGS, "The CUDA kernels do not yet support switched LJ interactions");
+ nbp->rvdw_switch = ic->rvdw_switch;
+ nbp->dispersion_shift = ic->dispersion_shift;
+ nbp->repulsion_shift = ic->repulsion_shift;
+ nbp->vdw_switch = ic->vdw_switch;
+
+ switch (ic->vdw_modifier)
+ {
+ case eintmodNONE:
+ case eintmodPOTSHIFT:
+ nbp->vdwtype = evdwCuCUT;
+ break;
+ case eintmodFORCESWITCH:
+ nbp->vdwtype = evdwCuFSWITCH;
+ break;
+ case eintmodPOTSWITCH:
+ nbp->vdwtype = evdwCuPSWITCH;
+ break;
+ default:
+ gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
+ break;
}
if (ic->eeltype == eelCUT)
#endif
#ifdef CALC_ENERGIES
- float lj_shift = nbparam.sh_invrc6;
#ifdef EL_EWALD_ANY
float beta = nbparam.ewald_beta;
float ewald_shift = nbparam.sh_ewald;
#else
float c_rf = nbparam.c_rf;
-#endif
- float *e_lj = atdat.e_lj;
- float *e_el = atdat.e_el;
-#endif
+#endif /* EL_EWALD_ANY */
+ float *e_lj = atdat.e_lj;
+ float *e_el = atdat.e_el;
+#endif /* CALC_ENERGIES */
/* thread/block/warp id-s */
unsigned int tidxi = threadIdx.x;
int_bit,
F_invr;
#ifdef CALC_ENERGIES
- float E_lj, E_el, E_lj_p;
+ float E_lj, E_el;
+#endif
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+ float E_lj_p;
#endif
unsigned int wexcl, imask, mask_ji;
float4 xqbuf;
#endif
F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
+#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
+ E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam.repulsion_shift.cpot)*ONE_TWELVETH_F -
+ c6 * (inv_r6 + nbparam.dispersion_shift.cpot)*ONE_SIXTH_F);
+#endif
+#ifdef LJ_FORCE_SWITCH
#ifdef CALC_ENERGIES
- E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f);
-#endif
+ calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+ calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_FORCE_SWITCH */
#ifdef VDW_CUTOFF_CHECK
- /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
- vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
- F_invr *= vdw_in_range;
+ /* Separate VDW cut-off check to enable twin-range cut-offs
+ * (rvdw < rcoulomb <= rlist)
+ */
+ vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+ F_invr *= vdw_in_range;
#ifdef CALC_ENERGIES
- E_lj_p *= vdw_in_range;
-#endif
+ E_lj_p *= vdw_in_range;
#endif
+#endif /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+ calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+ calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
#ifdef CALC_ENERGIES
E_lj += E_lj_p;
#endif
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE (CL_SIZE_SQ)
+#define ONE_SIXTH_F 0.16666667f
+#define ONE_TWELVETH_F 0.08333333f
+
+
/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
+/*! Apply force switch, force + energy version. */
+static inline __device__
+void calculate_force_switch_F(const cu_nbparam_t nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam.dispersion_shift.c2;
+ float disp_shift_V3 = nbparam.dispersion_shift.c3;
+ float repu_shift_V2 = nbparam.repulsion_shift.c2;
+ float repu_shift_V3 = nbparam.repulsion_shift.c3;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+ c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+}
+
+/*! Apply force switch, force-only version. */
+static inline __device__
+void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+
+ /* force switch constants */
+ float disp_shift_V2 = nbparam.dispersion_shift.c2;
+ float disp_shift_V3 = nbparam.dispersion_shift.c3;
+ float repu_shift_V2 = nbparam.repulsion_shift.c2;
+ float repu_shift_V3 = nbparam.repulsion_shift.c3;
+
+ float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
+ float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
+ float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
+ float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ *F_invr +=
+ -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+ c12*(-repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+ *E_lj +=
+ c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+ c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+}
+
+/*! Apply potential switch, force-only version. */
+static inline __device__
+void calculate_potential_switch_F(const cu_nbparam_t nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants */
+ float switch_V3 = nbparam.vdw_switch.c3;
+ float switch_V4 = nbparam.vdw_switch.c4;
+ float switch_V5 = nbparam.vdw_switch.c5;
+ float switch_F2 = 3*nbparam.vdw_switch.c3;
+ float switch_F3 = 4*nbparam.vdw_switch.c4;
+ float switch_F4 = 5*nbparam.vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+
+ /* Unlike in the F+E kernel, conditional is faster here */
+ if (r_switch > 0.0f)
+ {
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ }
+}
+
+/*! Apply potential switch, force + energy version. */
+static inline __device__
+void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
+ float c6,
+ float c12,
+ float inv_r,
+ float r2,
+ float *F_invr,
+ float *E_lj)
+{
+ float r, r_switch;
+ float sw, dsw;
+
+ /* potential switch constants */
+ float switch_V3 = nbparam.vdw_switch.c3;
+ float switch_V4 = nbparam.vdw_switch.c4;
+ float switch_V5 = nbparam.vdw_switch.c5;
+ float switch_F2 = 3*nbparam.vdw_switch.c3;
+ float switch_F3 = 4*nbparam.vdw_switch.c4;
+ float switch_F4 = 5*nbparam.vdw_switch.c5;
+
+ r = r2 * inv_r;
+ r_switch = r - nbparam.rvdw_switch;
+ r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+ /* Unlike in the F-only kernel, masking is faster here */
+ sw = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+ dsw = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+ *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+ *E_lj *= sw;
+}
+
+
/*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
* Original idea: from the OpenMM project
*/
*/
/*! \internal \file
- * This header has the sole purpose of generating kernels for the supported
- * electrostatics types: cut-off, reaction-field, Ewald, and tabulated Ewald.
+ * This header has the sole purpose of generating kernels for the combinations of
+ * supported electrostatics types (cut-off, reaction-field, analytical and
+ * tabulated Ewald) and VDW types ( V shift, F switch, V swtich).
*
* The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
* require an extra distance check to enable PP-PME load balancing
* NOTE: No include fence as it is meant to be included multiple times.
*/
-/* Analytical plain cut-off kernels */
+/* Analytical plain cut-off electrostatics kernels
+ */
#define EL_CUTOFF
+
+/* V shift */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
-#undef EL_CUTOFF
+#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
#undef NB_KERNEL_FUNC_NAME
-/* Analytical reaction-field kernels */
+#undef EL_CUTOFF
+
+/* Analytical reaction-field kernels
+ */
#define EL_RF
+
+/* V shift */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
-#undef EL_RF
#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_RF
+
/* Analytical Ewald interaction kernels
*/
#define EL_EWALD_ANA
+
+/* V shift */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_ANA
#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_ANA
+
+
/* Analytical Ewald interaction kernels with twin-range cut-off
*/
#define EL_EWALD_ANA
-#define VDW_CUTOFF_CHECK
+#define LJ_CUTOFF_CHECK
+
+/* V shift */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_ANA
-#undef VDW_CUTOFF_CHECK
+#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
#undef NB_KERNEL_FUNC_NAME
+#undef EL_EWALD_ANA
+#undef LJ_CUTOFF_CHECK
+
+
+
/* Tabulated Ewald interaction kernels */
#define EL_EWALD_TAB
+
+/* V shift */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_TAB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
#undef NB_KERNEL_FUNC_NAME
+#undef EL_EWALD_TAB
+
+
/* Tabulated Ewald interaction kernels with twin-range cut-off */
#define EL_EWALD_TAB
-#define VDW_CUTOFF_CHECK
+#define LJ_CUTOFF_CHECK
+
+/* V shift */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_TAB
-#undef VDW_CUTOFF_CHECK
#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_TAB
+#undef LJ_CUTOFF_CHECK
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2012, The GROMACS development team.
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* the research papers on the package. Check out http://www.gromacs.org.
*/
+/*! \internal \file
+ * \brief
+ * Data types used internally in the nbnxn_cuda module.
+ *
+ * \author Szilárd Páll <pall.szilard@gmail.com>
+ * \ingroup module_mdlib
+ */
+
#ifndef NBNXN_CUDA_TYPES_H
#define NBNXN_CUDA_TYPES_H
+#include "types/interaction_const.h"
#include "types/nbnxn_pairlist.h"
#include "types/nbnxn_cuda_types_ext.h"
#include "../../gmxlib/cuda_tools/cudautils.cuh"
#if CUDA_VERSION >= 5000
#define TEXOBJ_SUPPORTED
#else /* CUDA_VERSION */
-/* This typedef allows us to define only one version of struct cu_nbparam */
+/*! This typedef allows us to define only one version of struct cu_nbparam */
typedef int cudaTextureObject_t;
#endif /* CUDA_VERSION */
extern "C" {
#endif
-/** Types of electrostatics implementations available in the CUDA non-bonded
+/*! \brief Electrostatic CUDA kernel flavors.
+ *
+ * Types of electrostatics implementations available in the CUDA non-bonded
* force kernels. These represent both the electrostatics types implemented
* by the kernels (cut-off, RF, and Ewald - a subset of what's defined in
* enums.h) as well as encode implementation details analytical/tabulated
* Note that the cut-off and RF kernels have only analytical flavor and unlike
* in the CPU kernels, the tabulated kernels are ATM Ewald-only.
*
- * The order of pointers to different electrostatic kernels defined in
- * nbnxn_cuda.cu by the nb_default_kfunc_ptr array
- * should match the order of enumerated types below. */
-enum {
+ * The row-order of pointers to different electrostatic kernels defined in
+ * nbnxn_cuda.cu by the nb_*_kfunc_ptr function pointer table
+ * should match the order of enumerated types below.
+ */
+enum eelCu {
eelCuCUT, eelCuRF, eelCuEWALD_TAB, eelCuEWALD_TAB_TWIN, eelCuEWALD_ANA, eelCuEWALD_ANA_TWIN, eelCuNR
};
+/*! \brief VdW CUDA kernel flavors.
+ *
+ * The enumerates values correspond to the LJ implementations in the CUDA non-bonded
+ * kernels.
+ *
+ * The column-order of pointers to different electrostatic kernels defined in
+ * nbnxn_cuda.cu by the nb_*_kfunc_ptr function pointer table
+ * should match the order of enumerated types below.
+ */
+enum evdwCu {
+ evdwCuCUT, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuNR
+};
+
/* All structs prefixed with "cu_" hold data used in GPU calculations and
* are passed to the kernels, except cu_timers_t. */
+/*! \cond */
typedef struct cu_plist cu_plist_t;
typedef struct cu_atomdata cu_atomdata_t;
typedef struct cu_nbparam cu_nbparam_t;
typedef struct cu_timers cu_timers_t;
typedef struct nb_staging nb_staging_t;
+/*! \endcond */
-/** Staging area for temporary data. The energies get downloaded here first,
- * before getting added to the CPU-side aggregate values.
+/** \internal
+ * \brief Staging area for temporary data downloaded from the GPU.
+ *
+ * The energies/shift forces get downloaded here first, before getting added
+ * to the CPU-side aggregate values.
*/
struct nb_staging
{
float3 *fshift; /**< shift forces */
};
-/** Nonbonded atom data -- both inputs and outputs. */
+/** \internal
+ * \brief Nonbonded atom data - both inputs and outputs.
+ */
struct cu_atomdata
{
int natoms; /**< number of atoms */
float4 *xq; /**< atom coordinates + charges, size natoms */
float3 *f; /**< force output array, size natoms */
- /* TODO: try float2 for the energies */
- float *e_lj, /**< LJ energy output, size 1 */
- *e_el; /**< Electrostatics energy input, size 1 */
+
+ float *e_lj; /**< LJ energy output, size 1 */
+ float *e_el; /**< Electrostatics energy input, size 1 */
float3 *fshift; /**< shift forces */
bool bShiftVecUploaded; /**< true if the shift vector has been uploaded */
};
-/** Parameters required for the CUDA nonbonded calculations. */
+/** \internal
+ * \brief Parameters required for the CUDA nonbonded calculations.
+ */
struct cu_nbparam
{
- int eeltype; /**< type of electrostatics */
-
- float epsfac; /**< charge multiplication factor */
- float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */
- float two_k_rf; /**< Reaction-field electrostatics constant */
- float ewald_beta; /**< Ewald/PME parameter */
- float sh_ewald; /**< Ewald/PME correction term */
- float rvdw_sq; /**< VdW cut-off */
- float rcoulomb_sq; /**< Coulomb cut-off */
- float rlist_sq; /**< pair-list cut-off */
- float sh_invrc6; /**< LJ potential correction term */
+
+ int eeltype; /**< type of electrostatics, takes values from #eelCu */
+ int vdwtype; /**< type of VdW impl., takes values from #evdwCu */
+
+ float epsfac; /**< charge multiplication factor */
+ float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */
+ float two_k_rf; /**< Reaction-field electrostatics constant */
+ float ewald_beta; /**< Ewald/PME parameter */
+ float sh_ewald; /**< Ewald/PME correction term */
+ float rcoulomb_sq; /**< Coulomb cut-off squared */
+
+ float rvdw_sq; /**< VdW cut-off squared */
+ float rvdw_switch; /**< VdW switched cut-off */
+ float rlist_sq; /**< pair-list cut-off squared */
+
+ shift_consts_t dispersion_shift; /**< VdW shift dispersion constants */
+ shift_consts_t repulsion_shift; /**< VdW shift repulsion constants */
+ switch_consts_t vdw_switch; /**< VdW switch constants */
/* Non-bonded parameters - accessed through texture memory */
float *nbfp; /**< nonbonded parameter table with C6/C12 pairs */
cudaTextureObject_t coulomb_tab_texobj; /**< texture object bound to coulomb_tab */
};
-/** Pair list data */
+/** \internal
+ * \brief Pair list data.
+ */
struct cu_plist
{
int na_c; /**< number of atoms per cluster */
done during the current step */
};
-/** CUDA events used for timing GPU kernels and H2D/D2H transfers.
+/** \internal
+ * \brief CUDA events used for timing GPU kernels and H2D/D2H transfers.
+ *
* The two-sized arrays hold the local and non-local values and should always
* be indexed with eintLocal/eintNonlocal.
*/
cudaEvent_t stop_nb_k[2]; /**< stop event non-bonded kernels (l/nl, every step) */
};
-/** Main data structure for CUDA nonbonded force calculations. */
+/** \internal
+ * \brief Main data structure for CUDA nonbonded force calculations.
+ */
struct nbnxn_cuda
{
cuda_dev_info_t *dev_info; /**< CUDA device information */