const t_inputrec *ir,
gmx_bool bGPU)
{
- /* TODO: remove these GPU specific restrictions by implementing CUDA kernels */
- if (bGPU)
- {
- if (ir->vdwtype == evdwPME)
- {
- md_print_warn(cr, fplog, "LJ-PME is not yet supported with GPUs, falling back to CPU only\n");
- return FALSE;
- }
- }
-
- if (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB)
+ if (!bGPU && (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB))
{
md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with %s, falling back to %s\n",
                      bGPU ? "GPUs" : "SIMD kernels",
                      bGPU ? "CPU only" : "plain C kernels");

        return FALSE;
    }

    return TRUE;
}
-/* Stores the LJ parameter data in a format convenient for the SIMD kernels */
-static void set_ljparam_simd_data(nbnxn_atomdata_t *nbat)
+/* Stores the LJ parameter data in a format convenient for different kernels */
+static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD)
{
int nt, i, j;
real c6, c12;
nt = nbat->ntype;
- /* nbfp_s4 stores two parameters using a stride of 4,
- * because this would suit x86 SIMD single-precision
- * quad-load intrinsics. There's a slight inefficiency in
- * allocating and initializing nbfp_s4 when it might not
- * be used, but introducing the conditional code is not
- * really worth it. */
- nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4));
- for (i = 0; i < nt; i++)
+ if (bSIMD)
{
- for (j = 0; j < nt; j++)
+        /* nbfp_s4 stores two parameters using a stride of 4,
+         * because this suits x86 SIMD single-precision quad-load
+         * intrinsics; the two trailing zeros written below pad each
+         * {c6, c12} pair to the 4-float load width.
+         */
+ nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4));
+ for (i = 0; i < nt; i++)
{
- nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0];
- nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1];
- nbat->nbfp_s4[(i*nt+j)*4+2] = 0;
- nbat->nbfp_s4[(i*nt+j)*4+3] = 0;
+ for (j = 0; j < nt; j++)
+ {
+ nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0];
+ nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1];
+ nbat->nbfp_s4[(i*nt+j)*4+2] = 0;
+ nbat->nbfp_s4[(i*nt+j)*4+3] = 0;
+ }
}
}
+ /* We use combination rule data for SIMD combination rule kernels
+ * and with LJ-PME kernels. We then only need parameters per atom type,
+ * not per pair of atom types.
+ */
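+    /* E.g. with geometric rules c6_ij = sqrt(c6_ii*c6_jj), so storing the
+     * sqrt of each diagonal nbfp entry per type (done in the ljcrGEOM case
+     * below) is sufficient for the kernels to recombine any pair.
+     */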
switch (nbat->comb_rule)
{
case ljcrGEOM:
for (i = 0; i < nt; i++)
{
- /* Copy the diagonal from the nbfp matrix */
+ /* Store the sqrt of the diagonal from the nbfp matrix */
nbat->nbfp_comb[i*2 ] = sqrt(nbat->nbfp[(i*nt+i)*2 ]);
nbat->nbfp_comb[i*2+1] = sqrt(nbat->nbfp[(i*nt+i)*2+1]);
}
int i, j, nth;
real c6, c12, tol;
char *ptr;
- gmx_bool simple, bCombGeom, bCombLB;
+ gmx_bool simple, bCombGeom, bCombLB, bSIMD;
if (alloc == NULL)
{
gmx_incons("Unknown enbnxninitcombrule");
}
- if (simple)
- {
- set_ljparam_simd_data(nbat);
- }
+ bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN ||
+ nb_kernel_type == nbnxnk4xN_SIMD_2xNN);
+
+ set_lj_parameter_data(nbat, bSIMD);
nbat->natoms = 0;
nbat->type = NULL;
{
int pack_x;
- switch (nb_kernel_type)
+ if (bSIMD)
{
- case nbnxnk4xN_SIMD_4xN:
- case nbnxnk4xN_SIMD_2xNN:
- pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
- nbnxn_kernel_to_cj_size(nb_kernel_type));
- switch (pack_x)
- {
- case 4:
- nbat->XFormat = nbatX4;
- break;
- case 8:
- nbat->XFormat = nbatX8;
- break;
- default:
- gmx_incons("Unsupported packing width");
- }
- break;
- default:
- nbat->XFormat = nbatXYZ;
- break;
+ pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE,
+ nbnxn_kernel_to_cj_size(nb_kernel_type));
+ switch (pack_x)
+ {
+ case 4:
+ nbat->XFormat = nbatX4;
+ break;
+ case 8:
+ nbat->XFormat = nbatX8;
+ break;
+ default:
+ gmx_incons("Unsupported packing width");
+ }
+ }
+ else
+ {
+ nbat->XFormat = nbatXYZ;
}
nbat->FFormat = nbat->XFormat;
#define USE_TEXOBJ
#endif
-/*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/
+/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
texture<float, 1, cudaReadModeElementType> nbfp_texref;
+/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
+texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
+
/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
/*! Force-only kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] =
{
- { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda },
- { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda },
- { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda },
- { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda },
- { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda },
- { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda }
+ { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda },
+ { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda },
+ { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda },
+ { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda },
+ { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda },
+ { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda }
};
/*! Force + energy kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] =
{
- { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda },
- { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda },
- { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda },
- { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda },
- { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda },
- { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda }
+ { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda },
+ { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda },
+ { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda },
+ { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda },
+ { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda },
+ { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda }
};
/*! Force + pruning kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] =
{
- { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda },
- { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda },
- { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda },
- { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda },
- { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda },
- { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda }
+ { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda },
+ { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda },
+ { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda },
+ { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda },
+ { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda },
+ { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda }
};
/*! Force + energy + pruning kernel function pointers. */
static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] =
{
- { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda },
- { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda },
- { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda },
- { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda },
- { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda },
- { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda }
+ { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda },
+ { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda },
+ { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda },
+ { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda },
+ { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda },
+ { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda }
};
/*! Return a pointer to the kernel version to be executed at the current step. */
return nbfp_texref;
}
+/*! Return the reference to the nbfp_comb texture. */
+const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref()
+{
+ return nbfp_comb_texref;
+}
+
/*! Return the reference to the coulomb_tab. */
const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref()
{
/* Functions from nbnxn_cuda.cu */
extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo);
extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_texref();
+extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_nbfp_comb_texref();
extern const struct texture<float, 1, cudaReadModeElementType> &nbnxn_cuda_get_coulomb_tab_texref();
/* We should actually be using md_print_warn in md_logging.c,
const cuda_dev_info_t *dev_info)
{
cudaError_t stat;
- int ntypes, nnbfp;
+ int ntypes, nnbfp, nnbfp_comb;
ntypes = nbat->ntype;
- nbp->ewald_beta = ic->ewaldcoeff_q;
- nbp->sh_ewald = ic->sh_ewald;
- nbp->epsfac = ic->epsfac;
- nbp->two_k_rf = 2.0 * ic->k_rf;
- nbp->c_rf = ic->c_rf;
- nbp->rvdw_sq = ic->rvdw * ic->rvdw;
- nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
- nbp->rlist_sq = ic->rlist * ic->rlist;
+ nbp->ewald_beta = ic->ewaldcoeff_q;
+ nbp->sh_ewald = ic->sh_ewald;
+ nbp->epsfac = ic->epsfac;
+ nbp->two_k_rf = 2.0 * ic->k_rf;
+ nbp->c_rf = ic->c_rf;
+ nbp->rvdw_sq = ic->rvdw * ic->rvdw;
+ nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
+ nbp->rlist_sq = ic->rlist * ic->rlist;
+
+ nbp->sh_lj_ewald = ic->sh_lj_ewald;
+ nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
nbp->rvdw_switch = ic->rvdw_switch;
nbp->dispersion_shift = ic->dispersion_shift;
nbp->repulsion_shift = ic->repulsion_shift;
nbp->vdw_switch = ic->vdw_switch;
- switch (ic->vdw_modifier)
+ if (ic->vdwtype == evdwCUT)
+ {
+ switch (ic->vdw_modifier)
+ {
+ case eintmodNONE:
+ case eintmodPOTSHIFT:
+ nbp->vdwtype = evdwCuCUT;
+ break;
+ case eintmodFORCESWITCH:
+ nbp->vdwtype = evdwCuFSWITCH;
+ break;
+ case eintmodPOTSWITCH:
+ nbp->vdwtype = evdwCuPSWITCH;
+ break;
+ default:
+ gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
+ break;
+ }
+ }
+ else if (ic->vdwtype == evdwPME)
+ {
+ if (ic->ljpme_comb_rule == ljcrGEOM)
+ {
+ assert(nbat->comb_rule == ljcrGEOM);
+ nbp->vdwtype = evdwCuEWALDGEOM;
+ }
+ else
+ {
+ assert(nbat->comb_rule == ljcrLB);
+ nbp->vdwtype = evdwCuEWALDLB;
+ }
+ }
+ else
{
- case eintmodNONE:
- case eintmodPOTSHIFT:
- nbp->vdwtype = evdwCuCUT;
- break;
- case eintmodFORCESWITCH:
- nbp->vdwtype = evdwCuFSWITCH;
- break;
- case eintmodPOTSWITCH:
- nbp->vdwtype = evdwCuPSWITCH;
- break;
- default:
- gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!");
- break;
+ gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!");
}
if (ic->eeltype == eelCUT)
init_ewald_coulomb_force_table(nbp, dev_info);
}
- nnbfp = 2*ntypes*ntypes;
+ nnbfp = 2*ntypes*ntypes;
+ nnbfp_comb = 2*ntypes;
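+    /* nbfp is a per-type-pair table (2*ntypes^2 floats); nbfp_comb only
+     * needs per-type data (2*ntypes floats). */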
+
stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp));
CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp");
cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp));
+
+ if (ic->vdwtype == evdwPME)
+ {
+ stat = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
+ CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb");
+ cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb));
+ }
+
#ifdef TEXOBJ_SUPPORTED
/* Only device CC >= 3.0 (Kepler and later) support texture objects */
if (dev_info->prop.major >= 3)
{
cudaResourceDesc rd;
+ cudaTextureDesc td;
+
memset(&rd, 0, sizeof(rd));
rd.resType = cudaResourceTypeLinear;
        rd.res.linear.devPtr      = nbp->nbfp;
        rd.res.linear.desc.f      = cudaChannelFormatKindFloat;
        rd.res.linear.desc.x      = 32;
rd.res.linear.sizeInBytes = nnbfp*sizeof(*nbp->nbfp);
- cudaTextureDesc td;
memset(&td, 0, sizeof(td));
td.readMode = cudaReadModeElementType;
stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL);
CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed");
+
+ if (ic->vdwtype == evdwPME)
+ {
+ memset(&rd, 0, sizeof(rd));
+ rd.resType = cudaResourceTypeLinear;
+ rd.res.linear.devPtr = nbp->nbfp_comb;
+ rd.res.linear.desc.f = cudaChannelFormatKindFloat;
+ rd.res.linear.desc.x = 32;
+ rd.res.linear.sizeInBytes = nnbfp_comb*sizeof(*nbp->nbfp_comb);
+
+ memset(&td, 0, sizeof(td));
+ td.readMode = cudaReadModeElementType;
+ stat = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL);
+ CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed");
+ }
}
else
#endif
stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(),
nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp));
CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed");
+
+ if (ic->vdwtype == evdwPME)
+ {
+ stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(),
+ nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb));
+ CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed");
+ }
}
}
}
cu_free_buffered(nbparam->nbfp);
+ if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB)
+ {
+#ifdef TEXOBJ_SUPPORTED
+ /* Only device CC >= 3.0 (Kepler and later) support texture objects */
+ if (cu_nb->dev_info->prop.major >= 3)
+ {
+ stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj);
+ CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed");
+ }
+ else
+#endif
+ {
+ stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref());
+ CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed");
+ }
+ cu_free_buffered(nbparam->nbfp_comb);
+ }
+
stat = cudaFree(atdat->shift_vec);
CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec");
stat = cudaFree(atdat->fshift);
#define EL_EWALD_ANY
#endif
+#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_EWALD
+#endif
+
+#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD
+/* Macro to control the calculation of exclusion forces in the kernel.
+ * We do that with Ewald (elec/vdw) and RF; LJ_EWALD therefore has to be
+ * defined above, before this condition is evaluated.
+ *
+ * Note: convenience macro, needs to be undef-ed at the end of the file.
+ */
+#define EXCLUSION_FORCES
+#endif
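+
+/* Example: a translation unit compiled with EL_RF and LJ_EWALD_COMB_GEOM
+ * defined picks up LJ_EWALD and EXCLUSION_FORCES above, producing the
+ * ElecRF_VdwLJEwCombGeom kernel flavors instantiated by the multi-include
+ * header later in this patch.
+ */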
+
/*
Kernel launch parameters:
- #blocks = #pair lists, blockId = pair list Id
float rvdw_sq = nbparam.rvdw_sq;
float vdw_in_range;
#endif
+#ifdef LJ_EWALD
+ float lje_coeff2, lje_coeff6_6;
+#endif
#ifdef EL_RF
float two_k_rf = nbparam.two_k_rf;
#endif
fci_buf[ci_offset] = make_float3(0.0f);
}
+#ifdef LJ_EWALD
+    /* TODO: we are trading flops for registers by keeping the lje_coeff-s around; try re-calculating them where they are used instead */
+ lje_coeff2 = nbparam.ewaldcoeff_lj*nbparam.ewaldcoeff_lj;
+ lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F;
+#endif /* LJ_EWALD */
+
#ifdef CALC_ENERGIES
E_lj = 0.0f;
E_el = 0.0f;
-#if defined EL_EWALD_ANY || defined EL_RF
+#if defined EXCLUSION_FORCES /* Ewald or RF */
if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
{
- /* we have the diagonal: add the charge self interaction energy term */
+ /* we have the diagonal: add the charge and LJ self interaction energy term */
for (i = 0; i < NCL_PER_SUPERCL; i++)
{
+#if defined EL_EWALD_ANY || defined EL_RF
qi = xqib[i * CL_SIZE + tidxi].w;
E_el += qi*qi;
+#endif
+
+#if defined LJ_EWALD
+#ifdef USE_TEXOBJ
+ E_lj += tex1Dfetch<float>(nbparam.nbfp_texobj, atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2);
+#else
+ E_lj += tex1Dfetch(nbfp_texref, atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2);
+#endif /* USE_TEXOBJ */
+#endif /* LJ_EWALD */
+
}
- /* divide the self term equally over the j-threads */
+
+ /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+#ifdef LJ_EWALD
+ E_lj /= CL_SIZE;
+ E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6;
+#endif /* LJ_EWALD */
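+        /* The product above equals 0.5*C6_ii*beta_LJ^6/6 per i-atom (nbfp
+         * stores 6*C6), which cancels the LJ-PME mesh self-interaction,
+         * assuming the grid diagonal C6 matches the nbfp diagonal C6. */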
+
+#if defined EL_EWALD_ANY || defined EL_RF
E_el /= CL_SIZE;
#ifdef EL_RF
E_el *= -nbparam.epsfac*0.5f*c_rf;
#else
E_el *= -nbparam.epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
#endif
+#endif /* EL_EWALD_ANY || defined EL_RF */
}
-#endif
-#endif
+#endif /* EXCLUSION_FORCES */
+
+#endif /* CALC_ENERGIES */
/* skip central shifts when summing shift forces */
if (nb_sci.shift == CENTRAL)
int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
/* cutoff & exclusion check */
-#if defined EL_EWALD_ANY || defined EL_RF
+#ifdef EXCLUSION_FORCES
if (r2 < rcoulomb_sq *
(nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
#else
inv_r = rsqrt(r2);
inv_r2 = inv_r * inv_r;
inv_r6 = inv_r2 * inv_r2 * inv_r2;
-#if defined EL_EWALD_ANY || defined EL_RF
+#if defined EXCLUSION_FORCES
/* We could mask inv_r2, but with Ewald
* masking both inv_r6 and F_invr is faster */
inv_r6 *= int_bit;
-#endif
+#endif /* EXCLUSION_FORCES */
F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
#if defined CALC_ENERGIES || defined LJ_POT_SWITCH
#endif /* CALC_ENERGIES */
#endif /* LJ_FORCE_SWITCH */
+
+#ifdef LJ_EWALD
+#ifdef LJ_EWALD_COMB_GEOM
+#ifdef CALC_ENERGIES
+ calculate_lj_ewald_comb_geom_F_E(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p);
+#else
+ calculate_lj_ewald_comb_geom_F(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr);
+#endif /* CALC_ENERGIES */
+#elif defined LJ_EWALD_COMB_LB
+ calculate_lj_ewald_comb_LB_F_E(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6,
+#ifdef CALC_ENERGIES
+ int_bit, &F_invr, &E_lj_p
+#else
+ 0, &F_invr, NULL
+#endif /* CALC_ENERGIES */
+ );
+#endif /* LJ_EWALD_COMB_GEOM */
+#endif /* LJ_EWALD */
+
#ifdef VDW_CUTOFF_CHECK
/* Separate VDW cut-off check to enable twin-range cut-offs
* (rvdw < rcoulomb <= rlist)
}
#undef EL_EWALD_ANY
+#undef EXCLUSION_FORCES
+#undef LJ_EWALD
*E_lj *= sw;
}
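+/* The calculate_lj_ewald_* helpers below implement the real-space part of
+ * the r^-6 Ewald splitting
+ *   1/r^6 = [1 - poly*exp(-x^2)]/r^6 + poly*exp(-x^2)/r^6,
+ * with poly = 1 + x^2 + x^4/2 and x = ewaldcoeff_lj*r. The mesh computes
+ * the first (smooth) term using combination-rule C6 values (c6grid, which
+ * carries the factor 6), so within the cut-off the real-space kernels add
+ *   F/r += c6grid*(1/r^6 - exp(-x^2)*(poly/r^6 + beta^6/6))/r^2,
+ * which is -(1/r)*d/dr of the corresponding grid-correction energy.
+ */
+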
+/*! Calculate LJ-PME grid force contribution with
+ * geometric combination rule.
+ */
+static inline __device__
+void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float *F_invr)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+
+#ifdef USE_TEXOBJ
+ c6grid = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
+#else
+ c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
+#endif /* USE_TEXOBJ */
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = expf(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+}
+
+/*! Calculate LJ-PME grid force + energy contribution with
+ * geometric combination rule.
+ */
+static inline __device__
+void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ float *F_invr,
+ float *E_lj)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
+
+#ifdef USE_TEXOBJ
+ c6grid = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
+#else
+ c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
+#endif /* USE_TEXOBJ */
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = expf(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+ /* Shift should be applied only to real LJ pairs */
+ sh_mask = nbparam.sh_lj_ewald*int_bit;
+ *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+}
+
+/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
+ * Lorentz-Berthelot combination rule.
+ * We use a single F+E kernel with conditional because the performance impact
+ * of this is pretty small and LB on the CPU is anyway very slow.
+ */
+static inline __device__
+void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float int_bit,
+ float *F_invr,
+ float *E_lj)
+{
+ float c6grid, inv_r6_nm, cr2, expmcr2, poly;
+ float sigma, sigma2, epsilon;
+
+ /* sigma and epsilon are scaled to give 6*C6 */
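+    /* Per type, nbfp_comb is expected to hold 0.5*2^(1/6)*sigma_i and
+     * sqrt(12*eps_i), so that sigma = 2^(1/6)*sigma_ij, epsilon = 12*eps_ij
+     * and epsilon*sigma^6 = 24*eps_ij*sigma_ij^6 = 6*C6_ij below. */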
+#ifdef USE_TEXOBJ
+ sigma = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei ) + tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej );
+ epsilon = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej + 1);
+#else
+ sigma = tex1Dfetch(nbfp_comb_texref, 2*typei ) + tex1Dfetch(nbfp_comb_texref, 2*typej );
+ epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1);
+#endif /* USE_TEXOBJ */
+ sigma2 = sigma*sigma;
+ c6grid = epsilon*sigma2*sigma2*sigma2;
+
+ /* Recalculate inv_r6 without exclusion mask */
+ inv_r6_nm = inv_r2*inv_r2*inv_r2;
+ cr2 = lje_coeff2*r2;
+ expmcr2 = expf(-cr2);
+ poly = 1.0f + cr2 + 0.5f*cr2*cr2;
+
+ /* Subtract the grid force from the total LJ force */
+ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
+
+ if (E_lj != NULL)
+ {
+ float sh_mask;
+
+ /* Shift should be applied only to real LJ pairs */
+ sh_mask = nbparam.sh_lj_ewald*int_bit;
+ *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
+ }
+}
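+
+/* Sanity-check sketch (not compiled): host-side code verifying that the
+ * force expression used above is minus the r-derivative of the energy
+ * expression, via central differences. The helper name and output are
+ * illustrative only.
+ */
+#if 0
+#include <math.h>
+#include <stdio.h>
+
+static void check_lj_ewald_grid_terms(double beta, double c6grid, double r)
+{
+    double b2 = beta*beta;
+    double h  = 1e-6*r;
+
+    /* grid-correction energy at distance x, as in the *_F_E functions */
+#define EGRID(x) (c6grid/6.0*(1.0 - exp(-b2*(x)*(x)) \
+                  *(1.0 + b2*(x)*(x) + 0.5*b2*b2*(x)*(x)*(x)*(x)))/pow(x, 6))
+
+    double e    = exp(-b2*r*r);
+    double poly = 1.0 + b2*r*r + 0.5*b2*b2*r*r*r*r;
+    /* analytic force F = r*(F/r), with F/r as accumulated into F_invr */
+    double f    = c6grid*(1.0/pow(r, 6) - e*(poly/pow(r, 6) + b2*b2*b2/6.0))/r;
+    double fnum = -(EGRID(r + h) - EGRID(r - h))/(2.0*h);
+#undef EGRID
+
+    printf("analytic F = %g, numeric F = %g\n", f, fnum);
+}
+#endif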
/*! Interpolate Ewald coulomb force using the table through the coulomb_tab texture.
* Original idea: from the OpenMM project
/*! \internal \file
* This header has the sole purpose of generating kernels for the combinations of
* supported electrostatics types (cut-off, reaction-field, analytical and
- * tabulated Ewald) and VDW types ( V shift, F switch, V swtich).
+ * tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with
+ * geometric or Lorentz-Berthelot combination rule, F switch, V switch).
*
* The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
* require an extra distance check to enable PP-PME load balancing
*/
#define EL_CUTOFF
-/* V shift */
+/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
-/* F switch */
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombGeom ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombLB ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJFsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
-/* V switch */
+/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJPsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef EL_CUTOFF
+
/* Analytical reaction-field kernels
*/
#define EL_RF
-/* V shift */
+/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
-/* F switch */
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombGeom ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombLB ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJFsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
-/* V switch */
+/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJPsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
*/
#define EL_EWALD_ANA
-/* V shift */
+/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
-/* F switch */
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombGeom ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombLB ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJFsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
-/* V switch */
+/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJPsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef EL_EWALD_ANA
-
/* Analytical Ewald interaction kernels with twin-range cut-off
*/
#define EL_EWALD_ANA
#define LJ_CUTOFF_CHECK
-/* V shift */
+/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
-/* F switch */
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombLB ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJFsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
-/* V switch */
+/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJPsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef LJ_CUTOFF_CHECK
-
/* Tabulated Ewald interaction kernels */
#define EL_EWALD_TAB
-/* V shift */
+/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
-/* F switch */
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombGeom ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombLB ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJFsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
-/* V switch */
+/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJPsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#define EL_EWALD_TAB
#define LJ_CUTOFF_CHECK
-/* V shift */
+/* cut-off + V shift LJ */
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef NB_KERNEL_FUNC_NAME
-/* F switch */
+/* LJ-Ewald w geometric combination rules */
+#define LJ_EWALD_COMB_GEOM
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombGeom ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_GEOM
+#undef NB_KERNEL_FUNC_NAME
+/* LJ-Ewald w LB combination rules */
+#define LJ_EWALD_COMB_LB
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombLB ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_EWALD_COMB_LB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch LJ */
#define LJ_FORCE_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJFsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
#undef LJ_FORCE_SWITCH
#undef NB_KERNEL_FUNC_NAME
-/* V switch */
+/* V switch LJ */
#define LJ_POT_SWITCH
#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJPsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
* should match the order of enumerated types below.
*/
enum evdwCu {
- evdwCuCUT, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuNR
+ evdwCuCUT, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuEWALDGEOM, evdwCuEWALDLB, evdwCuNR
};
/* All structs prefixed with "cu_" hold data used in GPU calculations and
float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */
float two_k_rf; /**< Reaction-field electrostatics constant */
float ewald_beta; /**< Ewald/PME parameter */
- float sh_ewald; /**< Ewald/PME correction term */
+    float           sh_ewald;         /**< Ewald/PME correction term subtracted from the direct-space potential */
+    float           sh_lj_ewald;      /**< LJ-Ewald/PME correction term added to the potential */
+ float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient */
+
float rcoulomb_sq; /**< Coulomb cut-off squared */
float rvdw_sq; /**< VdW cut-off squared */
shift_consts_t repulsion_shift; /**< VdW shift repulsion constants */
switch_consts_t vdw_switch; /**< VdW switch constants */
- /* Non-bonded parameters - accessed through texture memory */
- float *nbfp; /**< nonbonded parameter table with C6/C12 pairs */
- cudaTextureObject_t nbfp_texobj; /**< texture object bound to nbfp */
+ /* LJ non-bonded parameters - accessed through texture memory */
+ float *nbfp; /**< nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements */
+ cudaTextureObject_t nbfp_texobj; /**< texture object bound to nbfp */
+ float *nbfp_comb; /**< nonbonded parameter table per atom type, 2*ntype elements */
+    cudaTextureObject_t  nbfp_comb_texobj;  /**< texture object bound to nbfp_comb */
/* Ewald Coulomb force table data - accessed through texture memory */
int coulomb_tab_size; /**< table size (s.t. it fits in texture cache) */