real rdd, real rconstr, const char *dddlb_opt, real dlb_scale,
const char *ddcsx, const char *ddcsy, const char *ddcsz,
const char *nbpu_opt,
- int nsteps_cmdline, int nstepout, int resetstep,
+ gmx_large_int_t nsteps_cmdline, int nstepout, int resetstep,
int nmultisim, int repl_ex_nst, int repl_ex_nex,
int repl_ex_seed, real pforce, real cpt_period, real max_hours,
const char *deviceOptions, unsigned long Flags);
SEL_CDATA_MINMAXALLOC = 16,
/** Whether subexpressions use simple pass evaluation functions. */
SEL_CDATA_SIMPLESUBEXPR = 32,
- /** Whether this expressions is a part of a common subexpression. */
- SEL_CDATA_COMMONSUBEXPR = 64
+ /*! \brief
+ * Whether a static subexpression needs to support multiple evaluations.
+ *
+ * This flag may only be set on \ref SEL_SUBEXPR elements that also have
+ * SEL_CDATA_SIMPLESUBEXPR.
+ */
+ SEL_CDATA_STATICMULTIEVALSUBEXPR = 64,
+ /** Whether this expression is a part of a common subexpression. */
+ SEL_CDATA_COMMONSUBEXPR = 128
};
/*! \internal \brief
{
fprintf(fp, "Ss");
}
+ if (sel->cdata->flags & SEL_CDATA_STATICMULTIEVALSUBEXPR)
+ {
+ fprintf(fp, "Sm");
+ }
if (sel->cdata->flags & SEL_CDATA_COMMONSUBEXPR)
{
fprintf(fp, "Sc");
break;
case SEL_SUBEXPR:
- sel->evaluate = (sel->refcount == 2
- ? &_gmx_sel_evaluate_subexpr_simple
- : &_gmx_sel_evaluate_subexpr);
+ if (sel->refcount == 2
+ && !(sel->cdata->flags & SEL_CDATA_STATICMULTIEVALSUBEXPR))
+ {
+ sel->evaluate = &_gmx_sel_evaluate_subexpr_simple;
+ }
+ else
+ {
+ sel->evaluate = &_gmx_sel_evaluate_subexpr;
+ }
break;
case SEL_SUBEXPRREF:
sel->name = sel->child->name;
- sel->evaluate = (sel->child->refcount == 2
+ sel->evaluate = ((sel->cdata->flags & SEL_CDATA_SIMPLESUBEXPR)
? &_gmx_sel_evaluate_subexprref_simple
: &_gmx_sel_evaluate_subexprref);
break;
}
-
+ sel->cdata->evaluate = sel->evaluate;
return TRUE;
}
}
}
- if (sel->type == SEL_SUBEXPR && sel->refcount == 2)
+ if (sel->type == SEL_SUBEXPR && sel->refcount == 2
+ && !(sel->cdata->flags & SEL_CDATA_STATICMULTIEVALSUBEXPR))
{
sel->flags &= ~(SEL_ALLOCVAL | SEL_ALLOCDATA);
if (sel->v.type == GROUP_VALUE || sel->v.type == POS_VALUE)
_gmx_selvalue_setstore(&sel->v, sel->child->v.u.ptr);
}
}
- else if (sel->type == SEL_SUBEXPRREF && sel->child->refcount == 2)
+ else if (sel->type == SEL_SUBEXPRREF
+ && (sel->cdata->flags & SEL_CDATA_SIMPLESUBEXPR))
{
if (sel->v.u.ptr)
{
init_item_staticeval(child);
}
}
+ /* If an expression is evaluated for a dynamic group, then also
+ * atom-valued parameters need to be evaluated every time. */
+ if ((sel->flags & SEL_DYNAMIC)
+ && (sel->type == SEL_EXPRESSION || sel->type == SEL_MODIFIER)
+ && (child->flags & SEL_ATOMVAL))
+ {
+ child->flags |= SEL_DYNAMIC;
+ child->cdata->flags &= ~SEL_CDATA_STATIC;
+ }
child = child->next;
}
}
}
else if (sel->type == SEL_SUBEXPRREF && sel->child->refcount == 2)
{
- sel->cdata->flags |= SEL_CDATA_SIMPLESUBEXPR;
+ /* See similar condition in init_item_staticeval(). */
+ if ((sel->flags & SEL_ATOMVAL)
+ && (sel->flags & SEL_DYNAMIC)
+ && !(sel->child->flags & SEL_DYNAMIC))
+ {
+ sel->child->cdata->flags |= SEL_CDATA_STATICMULTIEVALSUBEXPR;
+ }
+ else
+ {
+ sel->cdata->flags |= SEL_CDATA_SIMPLESUBEXPR;
+ }
}
/* Process children, but only follow subexpression references if the
break;
case SEL_SUBEXPR:
- if (sel->cdata->flags & (SEL_CDATA_SIMPLESUBEXPR | SEL_CDATA_FULLEVAL))
+ if (((sel->cdata->flags & SEL_CDATA_SIMPLESUBEXPR) &&
+ !(sel->cdata->flags & SEL_CDATA_STATICMULTIEVALSUBEXPR))
+ || (sel->cdata->flags & SEL_CDATA_FULLEVAL))
{
rc = sel->cdata->evaluate(data, sel, g);
_gmx_selvalue_setstore(&sel->v, sel->child->v.u.ptr);
sel->child->flags &= ~(SEL_ALLOCVAL | SEL_ALLOCDATA);
sel->child->v.nalloc = -1;
}
+
+ /* For static subexpressions with a dynamic evaluation group, there is
+ * no need to evaluate them again, as the SEL_SUBEXPRREF takes care of
+ * everything during evaluation. */
+ if (sel->type == SEL_SUBEXPR
+ && (sel->cdata->flags & SEL_CDATA_SIMPLESUBEXPR)
+ && (sel->cdata->flags & SEL_CDATA_STATICMULTIEVALSUBEXPR))
+ {
+ sel->evaluate = NULL;
+ sel->cdata->evaluate = NULL;
+ }
}
/* FIXME: Clean up the collection */
return -1;
}
- /* Initialize evaluation function. */
- if (!init_item_evalfunc(item))
- {
- /* FIXME: Clean up the collection */
- return -1;
- }
- setup_memory_pooling(item, sc->mempool);
/* Initialize the compiler data */
init_item_compilerdata(item);
+ item = item->next;
+ }
+ /* Initialize the static evaluation compiler flags.
+ * Requires the FULLEVAL compiler flag for the whole tree. */
+ item = sc->root;
+ while (item)
+ {
init_item_staticeval(item);
item = item->next;
}
- /* Initialize subexpression flags and evaluation output.
+ /* Initialize subexpression flags.
* Requires compiler flags for the full tree. */
item = sc->root;
while (item)
{
init_item_subexpr_flags(item);
+ item = item->next;
+ }
+ /* Initialize evaluation.
+ * Requires subexpression flags. */
+ item = sc->root;
+ while (item)
+ {
+ if (!init_item_evalfunc(item))
+ {
+ /* FIXME: Clean up the collection */
+ return -1;
+ }
+ setup_memory_pooling(item, sc->mempool);
init_item_evaloutput(item);
item = item->next;
}
while (sel)
{
/* Clear the evaluation group of subexpressions */
- if (sel->child && sel->child->type == SEL_SUBEXPR)
+ if (sel->child && sel->child->type == SEL_SUBEXPR
+ && sel->child->evaluate != NULL)
{
sel->child->u.cgrp.isize = 0;
/* Not strictly necessary, because the value will be overwritten
t_selelem *expr;
int i, j;
- if (g)
+ if (g != NULL && sel->child->evaluate != NULL)
{
int rc;
int repl_ex_nex = 0;
int nstepout = 100;
int resetstep = -1;
- int nsteps = -2; /* the value -2 means that the mdp option will be used */
+ gmx_large_int_t nsteps = -2; /* the value -2 means that the mdp option will be used */
rvec realddxyz = {0, 0, 0};
const char *ddno_opt[ddnoNR+1] =
"Keep and number checkpoint files" },
{ "-append", FALSE, etBOOL, {&bAppendFiles},
"Append to previous output files when continuing from checkpoint instead of adding the simulation part number to all file names" },
- { "-nsteps", FALSE, etINT, {&nsteps},
+ { "-nsteps", FALSE, etGMX_LARGE_INT, {&nsteps},
"Run this number of steps, overrides .mdp file option" },
{ "-maxh", FALSE, etREAL, {&max_hours},
"Terminate after 0.99 times this time (hours)" },
const char *ddcsy;
const char *ddcsz;
const char *nbpu_opt;
- int nsteps_cmdline;
+ gmx_large_int_t nsteps_cmdline;
int nstepout;
int resetstep;
int nmultisim;
const char *dddlb_opt, real dlb_scale,
const char *ddcsx, const char *ddcsy, const char *ddcsz,
const char *nbpu_opt,
- int nsteps_cmdline, int nstepout, int resetstep,
+ gmx_large_int_t nsteps_cmdline,
+ int nstepout, int resetstep,
int nmultisim, int repl_ex_nst, int repl_ex_nex, int repl_ex_seed,
real pforce, real cpt_period, real max_hours,
const char *deviceOptions, unsigned long Flags)
/* Override the value in inputrec with value passed on the command line (if any) */
static void override_nsteps_cmdline(FILE *fplog,
- int nsteps_cmdline,
+ gmx_large_int_t nsteps_cmdline,
t_inputrec *ir,
const t_commrec *cr)
{
+ char sbuf[STEPSTRSIZE];
+
assert(ir);
assert(cr);
ir->nsteps = nsteps_cmdline;
if (EI_DYNAMICS(ir->eI))
{
- sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps, %.3f ps",
- nsteps_cmdline, nsteps_cmdline*ir->delta_t);
+ sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps, %.3f ps",
+ gmx_step_str(nsteps_cmdline, sbuf),
+ nsteps_cmdline*ir->delta_t);
}
else
{
- sprintf(stmp, "Overriding nsteps with value passed on the command line: %d steps",
- nsteps_cmdline);
+ sprintf(stmp, "Overriding nsteps with value passed on the command line: %s steps",
+ gmx_step_str(nsteps_cmdline, sbuf));
}
md_print_warn(cr, fplog, "%s\n", stmp);
const char *dddlb_opt, real dlb_scale,
const char *ddcsx, const char *ddcsy, const char *ddcsz,
const char *nbpu_opt,
- int nsteps_cmdline, int nstepout, int resetstep,
+ gmx_large_int_t nsteps_cmdline, int nstepout, int resetstep,
int nmultisim, int repl_ex_nst, int repl_ex_nex,
int repl_ex_seed, real pforce, real cpt_period, real max_hours,
const char *deviceOptions, unsigned long Flags)
Ek = trace(ekind->tcstat[i].ekinh);
}
- if (opts->tau_t[i] > 0 && opts->nrdf[i] > 0 && Ek > 0)
+ if (opts->tau_t[i] >= 0 && opts->nrdf[i] > 0 && Ek > 0)
{
Ek_ref1 = 0.5*opts->ref_t[i]*BOLTZ;
Ek_ref = Ek_ref1*opts->nrdf[i];
/***** The kernels come here *****/
#include "nbnxn_cuda_kernel_utils.cuh"
-/* Generate all combinations of kernels through multiple inclusion:
- F, F + E, F + prune, F + E + prune. */
+/* Top-level kernel generation: will generate through multiple inclusion the
+ * following flavors for all kernels:
+ * - force-only output;
+ * - force and energy output;
+ * - force-only with pair list pruning;
+ * - force and energy output with pair list pruning.
+ */
/** Force only **/
#include "nbnxn_cuda_kernels.cuh"
/** Force & energy **/
/* do we exceed the grid x dimension limit? */
if (nwork_units > max_grid_x_size)
{
- gmx_fatal(FARGS, "Watch out system too large to simulate!\n"
+ gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
"The number of nonbonded work units (=number of super-clusters) exceeds the"
"maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
}
static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
static const int nPruneKernelTypes = 2; /* 0 - no prune, 1 - prune */
-/* Default kernels */
+/*! Pointers to the default kernels organized in a 3 dim array by:
+ * electrostatics type, energy calculation on/off, and pruning on/off.
+ *
+ * Note that the order of electrostatics (1st dimension) has to match the
+ * order of corresponding enumerated types defined in nbnxn_cuda_types.h.
+ */
static const nbnxn_cu_kfunc_ptr_t
nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
{
- { { k_nbnxn_ewald, k_nbnxn_ewald_prune },
- { k_nbnxn_ewald_ener, k_nbnxn_ewald_ener_prune } },
- { { k_nbnxn_ewald_twin, k_nbnxn_ewald_twin_prune },
- { k_nbnxn_ewald_twin_ener, k_nbnxn_ewald_twin_ener_prune } },
- { { k_nbnxn_rf, k_nbnxn_rf_prune },
- { k_nbnxn_rf_ener, k_nbnxn_rf_ener_prune } },
- { { k_nbnxn_cutoff, k_nbnxn_cutoff_prune },
- { k_nbnxn_cutoff_ener, k_nbnxn_cutoff_ener_prune } },
+ { { k_nbnxn_cutoff, k_nbnxn_cutoff_prune },
+ { k_nbnxn_cutoff_ener, k_nbnxn_cutoff_ener_prune } },
+ { { k_nbnxn_rf, k_nbnxn_rf_prune },
+ { k_nbnxn_rf_ener, k_nbnxn_rf_ener_prune } },
+ { { k_nbnxn_ewald_tab, k_nbnxn_ewald_tab_prune },
+ { k_nbnxn_ewald_tab_ener, k_nbnxn_ewald_tab_ener_prune } },
+ { { k_nbnxn_ewald_tab_twin, k_nbnxn_ewald_tab_twin_prune },
+ { k_nbnxn_ewald_tab_twin_ener, k_nbnxn_ewald_twin_ener_prune } },
+ { { k_nbnxn_ewald, k_nbnxn_ewald_prune },
+ { k_nbnxn_ewald_ener, k_nbnxn_ewald_ener_prune } },
+ { { k_nbnxn_ewald_twin, k_nbnxn_ewald_twin_prune },
+ { k_nbnxn_ewald_twin_ener, k_nbnxn_ewald_twin_ener_prune } },
};
-/* Legacy kernels */
+/*! Pointers to the legacy kernels organized in a 3 dim array by:
+ * electrostatics type, energy calculation on/off, and pruning on/off.
+ *
+ * Note that the order of electrostatics (1st dimension) has to match the
+ * order of corresponding enumerated types defined in nbnxn_cuda_types.h.
+ */
static const nbnxn_cu_kfunc_ptr_t
nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
{
- { { k_nbnxn_ewald_legacy, k_nbnxn_ewald_prune_legacy },
- { k_nbnxn_ewald_ener_legacy, k_nbnxn_ewald_ener_prune_legacy } },
- { { k_nbnxn_ewald_twin_legacy, k_nbnxn_ewald_twin_prune_legacy },
- { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
- { { k_nbnxn_rf_legacy, k_nbnxn_rf_prune_legacy },
- { k_nbnxn_rf_ener_legacy, k_nbnxn_rf_ener_prune_legacy } },
- { { k_nbnxn_cutoff_legacy, k_nbnxn_cutoff_prune_legacy },
- { k_nbnxn_cutoff_ener_legacy, k_nbnxn_cutoff_ener_prune_legacy } },
+ { { k_nbnxn_cutoff_legacy, k_nbnxn_cutoff_prune_legacy },
+ { k_nbnxn_cutoff_ener_legacy, k_nbnxn_cutoff_ener_prune_legacy } },
+ { { k_nbnxn_rf_legacy, k_nbnxn_rf_prune_legacy },
+ { k_nbnxn_rf_ener_legacy, k_nbnxn_rf_ener_prune_legacy } },
+ { { k_nbnxn_ewald_tab_legacy, k_nbnxn_ewald_tab_prune_legacy },
+ { k_nbnxn_ewald_tab_ener_legacy, k_nbnxn_ewald_tab_ener_prune_legacy } },
+ { { k_nbnxn_ewald_tab_twin_legacy, k_nbnxn_ewald_tab_twin_prune_legacy },
+ { k_nbnxn_ewald_tab_twin_ener_legacy, k_nbnxn_ewald_tab_twin_ener_prune_legacy } },
};
/*! Return a pointer to the kernel version to be executed at the current step. */
if (NBNXN_KVER_LEGACY(kver))
{
+ /* no analytical Ewald with legacy kernels */
+ assert(eeltype <= eelCuEWALD_TAB_TWIN);
+
return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
}
else
cudaError_t stat;
for (int i = 0; i < eelCuNR; i++)
+ {
for (int j = 0; j < nEnergyKernelTypes; j++)
+ {
for (int k = 0; k < nPruneKernelTypes; k++)
{
- /* Legacy kernel 16/48 kB Shared/L1 */
- stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
- CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+ /* Legacy kernel 16/48 kB Shared/L1
+ * No analytical Ewald!
+ */
+ if (i != eelCuEWALD_ANA && i != eelCuEWALD_ANA_TWIN)
+ {
+ stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
+ CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+ }
if (devinfo->prop.major >= 3)
{
}
CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
}
+ }
+ }
}
ad->nalloc = -1;
}
+/*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
+ earlier GPUs, single or twin cut-off. */
+static int pick_ewald_kernel_type(bool bTwinCut,
+ const cuda_dev_info_t *dev_info)
+{
+ bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
+ int kernel_type;
+
+ /* Benchmarking/development environment variables to force the use of
+ analytical or tabulated Ewald kernel. */
+ bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
+ bForceTabulatedEwald = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
+
+ if (bForceAnalyticalEwald && bForceTabulatedEwald)
+ {
+ gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
+ "requested through environment variables.");
+ }
+
+ /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
+ if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
+ {
+ bUseAnalyticalEwald = true;
+
+ if (debug)
+ {
+ fprintf(debug, "Using analytical Ewald CUDA kernels\n");
+ }
+ }
+ else
+ {
+ bUseAnalyticalEwald = false;
+
+ if (debug)
+ {
+ fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
+ }
+ }
+
+ /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
+ forces it (use it for debugging/benchmarking only). */
+ if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
+ {
+ kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
+ }
+ else
+ {
+ kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
+ }
+
+ return kernel_type;
+}
+
+
/*! Initializes the nonbonded parameter data structure. */
static void init_nbparam(cu_nbparam_t *nbp,
const interaction_const_t *ic,
- const nonbonded_verlet_t *nbv)
+ const nonbonded_verlet_t *nbv,
+ const cuda_dev_info_t *dev_info)
{
cudaError_t stat;
int ntypes, nnbfp;
}
else if ((EEL_PME(ic->eeltype) || ic->eeltype==eelEWALD))
{
- /* Initially rcoulomb == rvdw, so it's surely not twin cut-off, unless
- forced by the env. var. (used only for benchmarking). */
- if (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL)
- {
- nbp->eeltype = eelCuEWALD;
- }
- else
- {
- nbp->eeltype = eelCuEWALD_TWIN;
- }
+ /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
+ nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
}
else
{
}
/* generate table for PME */
- if (nbp->eeltype == eelCuEWALD)
+ nbp->coulomb_tab = NULL;
+ if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
{
- nbp->coulomb_tab = NULL;
init_ewald_coulomb_force_table(nbp);
}
nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
nbp->ewald_beta = ic->ewaldcoeff;
- /* When switching to/from twin cut-off, the electrostatics type needs updating.
- (The env. var. that forces twin cut-off is for benchmarking only!) */
- if (ic->rcoulomb == ic->rvdw &&
- getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL)
- {
- nbp->eeltype = eelCuEWALD;
- }
- else
- {
- nbp->eeltype = eelCuEWALD_TWIN;
- }
+ nbp->eeltype = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
+ cu_nb->dev_info);
init_ewald_coulomb_force_table(cu_nb->nbparam);
}
const nonbonded_verlet_t *nbv)
{
init_atomdata_first(cu_nb->atdat, nbv->grp[0].nbat->ntype);
- init_nbparam(cu_nb->nbparam, ic, nbv);
+ init_nbparam(cu_nb->nbparam, ic, nbv, cu_nb->dev_info);
/* clear energy and shift force outputs */
nbnxn_cuda_clear_e_fshift(cu_nb);
plist_nl = cu_nb->plist[eintNonlocal];
timers = cu_nb->timers;
- if (nbparam->eeltype == eelCuEWALD || nbparam->eeltype == eelCuEWALD_TWIN)
+ if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
{
stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
#define IATYPE_SHMEM
#endif
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
/*
Kernel launch parameters:
- #blocks = #pair lists, blockId = pair list Id
#ifdef EL_RF
float two_k_rf = nbparam.two_k_rf;
#endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
float coulomb_tab_scale = nbparam.coulomb_tab_scale;
#endif
+#ifdef EL_EWALD_ANA
+ float beta2 = nbparam.ewald_beta*nbparam.ewald_beta;
+ float beta3 = nbparam.ewald_beta*nbparam.ewald_beta*nbparam.ewald_beta;
+#endif
#ifdef PRUNE_NBL
float rlist_sq = nbparam.rlist_sq;
#endif
#ifdef CALC_ENERGIES
float lj_shift = nbparam.sh_invrc6;
-#ifdef EL_EWALD
+#ifdef EL_EWALD_ANY
float beta = nbparam.ewald_beta;
float ewald_shift = nbparam.sh_ewald;
#else
E_lj = 0.0f;
E_el = 0.0f;
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_ANY || defined EL_RF
if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
{
/* we have the diagonal: add the charge self interaction energy term */
fcj_buf = make_float3(0.0f);
/* The PME and RF kernels don't unroll with CUDA <v4.1. */
-#if !defined PRUNE_NBL && !(CUDA_VERSION < 4010 && (defined EL_EWALD || defined EL_RF))
+#if !defined PRUNE_NBL && !(CUDA_VERSION < 4010 && (defined EL_EWALD_ANY || defined EL_RF))
#pragma unroll 8
#endif
for(i = 0; i < NCL_PER_SUPERCL; i++)
int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
/* cutoff & exclusion check */
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_ANY || defined EL_RF
if (r2 < rcoulomb_sq *
(nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
#else
inv_r = rsqrt(r2);
inv_r2 = inv_r * inv_r;
inv_r6 = inv_r2 * inv_r2 * inv_r2;
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_ANY || defined EL_RF
/* We could mask inv_r2, but with Ewald
* masking both inv_r6 and F_invr is faster */
inv_r6 *= int_bit;
#ifdef EL_RF
F_invr += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
#endif
-#ifdef EL_EWALD
+#if defined EL_EWALD_ANA
+ F_invr += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
F_invr += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r;
-#endif
+#endif /* EL_EWALD_ANA/TAB */
#ifdef CALC_ENERGIES
#ifdef EL_CUTOFF
#ifdef EL_RF
E_el += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
#endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_ANY
/* 1.0f - erff is faster than erfcf */
E_el += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
-#endif
+#endif /* EL_EWALD_ANY */
#endif
f_ij = rv * F_invr;
#endif
#endif
}
+
+#undef EL_EWALD_ANY
* code that is in double precision.
*/
+/* Analytical Ewald is not implemented for the legacy kernels (as it is anyway
+ slower than the tabulated kernel on Fermi). */
+#ifdef EL_EWALD_ANA
+#error Trying to generate Analytical Ewald legacy kernels which is not implemented in the legacy kernels!
+#endif
+
/*
Kernel launch parameters:
- #blocks = #pair lists, blockId = pair list Id
#ifdef EL_RF
float two_k_rf = nbparam.two_k_rf;
#endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
float coulomb_tab_scale = nbparam.coulomb_tab_scale;
#endif
#ifdef PRUNE_NBL
#ifdef CALC_ENERGIES
float lj_shift = nbparam.sh_invrc6;
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
float beta = nbparam.ewald_beta;
float ewald_shift = nbparam.sh_ewald;
#else
float qi, qj_f,
r2, inv_r, inv_r2, inv_r6,
c6, c12,
+ int_bit,
#ifdef CALC_ENERGIES
E_lj, E_el, E_lj_p,
#endif
F_invr;
- unsigned int wexcl, int_bit, imask, imask_j;
-#ifdef PRUNE_NBL
- unsigned int imask_prune;
-#endif
+ unsigned int wexcl, imask, mask_ji;
float4 xqbuf;
float3 xi, xj, rv, f_ij, fcj_buf, fshift_buf;
float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
E_lj = 0.0f;
E_el = 0.0f;
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_TAB || defined EL_RF
if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
{
/* we have the diagonal: add the charge self interaction energy term */
if (imask)
#endif
{
-#ifdef PRUNE_NBL
- imask_prune = imask;
-#endif
-
/* nvcc >v4.1 doesn't like this loop, it refuses to unroll it */
#if CUDA_VERSION >= 4010
#pragma unroll 4
#endif
for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
{
- imask_j = (imask >> (jm * CL_SIZE)) & supercl_interaction_mask;
- if (imask_j)
+ mask_ji = (imask >> (jm * CL_SIZE)) & supercl_interaction_mask;
+ if (mask_ji)
{
- nsubi = __popc(imask_j);
+ nsubi = __popc(mask_ji);
cj = pl_cj4[j4].cj[jm];
aj = cj * CL_SIZE + tidxj;
which is a pity because here unrolling could help. */
for (cii = 0; cii < nsubi; cii++)
{
- i = __ffs(imask_j) - 1;
- imask_j &= ~(1U << i);
+ i = __ffs(mask_ji) - 1;
+ mask_ji &= ~(1U << i);
ci_offset = i; /* i force buffer offset */
cluster-pair in imask gets set to 0. */
if (!__any(r2 < rlist_sq))
{
- imask_prune &= ~(1U << (jm * NCL_PER_SUPERCL + i));
+ imask &= ~(1U << (jm * NCL_PER_SUPERCL + i));
}
#endif
- int_bit = ((wexcl >> (jm * NCL_PER_SUPERCL + i)) & 1);
+ int_bit = ((wexcl >> (jm * NCL_PER_SUPERCL + i)) & 1) ? 1.0f : 0.0f;
/* cutoff & exclusion check */
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_TAB || defined EL_RF
if (r2 < rcoulomb_sq *
(nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
#else
inv_r = rsqrt(r2);
inv_r2 = inv_r * inv_r;
inv_r6 = inv_r2 * inv_r2 * inv_r2;
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_TAB || defined EL_RF
/* We could mask inv_r2, but with Ewald
* masking both inv_r6 and F_invr is faster */
inv_r6 *= int_bit;
F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
#ifdef CALC_ENERGIES
- E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f);
+ E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f);
#endif
#ifdef VDW_CUTOFF_CHECK
- /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
- vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
- F_invr *= vdw_in_range;
+ /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
+ vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+ F_invr *= vdw_in_range;
#ifdef CALC_ENERGIES
- E_lj_p *= vdw_in_range;
+ E_lj_p *= vdw_in_range;
#endif
#endif
#ifdef CALC_ENERGIES
- E_lj += E_lj_p;
+ E_lj += E_lj_p;
#endif
#ifdef EL_RF
F_invr += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
#endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
F_invr += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r;
-#endif
+#endif /* EL_EWALD_TAB */
#ifdef CALC_ENERGIES
#ifdef EL_CUTOFF
#ifdef EL_RF
E_el += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
#endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
/* 1.0f - erff is faster than erfcf */
E_el += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
#endif
#ifdef PRUNE_NBL
/* Update the imask with the new one which does not contain the
out of range clusters anymore. */
- pl_cj4[j4].imei[widx].imask = imask_prune;
+ pl_cj4[j4].imei[widx].imask = imask;
#endif
}
}
+ fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
}
+/*! Calculate analytical Ewald correction term. */
+static inline __device__
+float pmecorrF(float z2)
+{
+ const float FN6 = -1.7357322914161492954e-8f;
+ const float FN5 = 1.4703624142580877519e-6f;
+ const float FN4 = -0.000053401640219807709149f;
+ const float FN3 = 0.0010054721316683106153f;
+ const float FN2 = -0.019278317264888380590f;
+ const float FN1 = 0.069670166153766424023f;
+ const float FN0 = -0.75225204789749321333f;
+
+ const float FD4 = 0.0011193462567257629232f;
+ const float FD3 = 0.014866955030185295499f;
+ const float FD2 = 0.11583842382862377919f;
+ const float FD1 = 0.50736591960530292870f;
+ const float FD0 = 1.0f;
+
+ float z4;
+ float polyFN0,polyFN1,polyFD0,polyFD1;
+
+ z4 = z2*z2;
+
+ polyFD0 = FD4*z4 + FD2;
+ polyFD1 = FD3*z4 + FD1;
+ polyFD0 = polyFD0*z4 + FD0;
+ polyFD0 = polyFD1*z2 + polyFD0;
+
+ polyFD0 = 1.0f/polyFD0;
+
+ polyFN0 = FN6*z4 + FN4;
+ polyFN1 = FN5*z4 + FN3;
+ polyFN0 = polyFN0*z4 + FN2;
+ polyFN1 = polyFN1*z4 + FN1;
+ polyFN0 = polyFN0*z4 + FN0;
+ polyFN0 = polyFN1*z2 + polyFN0;
+
+ return polyFN0*polyFD0;
+}
+
/*! Final j-force reduction; this generic implementation works with
* arbitrary array sizes.
*/
*/
/*! \file
- * This header has the sole purpose of generating kernels for the different
- * type of electrostatics supported: Cut-off, Reaction-Field, and Ewald/PME;
- * the latter has a twin-range cut-off version (rcoul!=rvdw) which enables
- * PME tuning (otherwise in the Verlet scheme rcoul==rvdw).
+ * This header has the sole purpose of generating kernels for the supported
+ * electrostatics types: cut-off, reaction-field, Ewald, and tabulated Ewald.
*
- * (No include fence as it is meant to be included multiple times.)
+ * The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ * require an extra distance check to enable PP-PME load balancing
+ * (otherwise, by default rcoul == rvdw).
+ *
+ * NOTE: No include fence as it is meant to be included multiple times.
*/
-/* Cut-Off */
+/* Analytical plain cut-off kernels */
#define EL_CUTOFF
#define NB_KERNEL_FUNC_NAME(x,...) x##_cutoff##__VA_ARGS__
#include "nbnxn_cuda_kernel_legacy.cuh"
#undef EL_CUTOFF
#undef NB_KERNEL_FUNC_NAME
-/* Reaction-Field */
+/* Analytical reaction-field kernels */
#define EL_RF
#define NB_KERNEL_FUNC_NAME(x,...) x##_rf##__VA_ARGS__
#include "nbnxn_cuda_kernel_legacy.cuh"
#undef EL_RF
#undef NB_KERNEL_FUNC_NAME
-/* Ewald */
-#define EL_EWALD
+/* Analytical Ewald interaction kernels
+ * NOTE: no legacy kernels with analytical Ewald.
+ */
+#define EL_EWALD_ANA
#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald##__VA_ARGS__
-#include "nbnxn_cuda_kernel_legacy.cuh"
#include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD
+#undef EL_EWALD_ANA
#undef NB_KERNEL_FUNC_NAME
-/* Ewald with twin-range cut-off */
-#define EL_EWALD
+/* Analytical Ewald interaction kernels with twin-range cut-off
+ * NOTE: no legacy kernels with analytical Ewald.
+ */
+#define EL_EWALD_ANA
#define VDW_CUTOFF_CHECK
#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_twin##__VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef EL_EWALD_ANA
+#undef VDW_CUTOFF_CHECK
+#undef NB_KERNEL_FUNC_NAME
+
+/* Tabulated Ewald interaction kernels */
+#define EL_EWALD_TAB
+#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab##__VA_ARGS__
+#include "nbnxn_cuda_kernel_legacy.cuh"
+#include "nbnxn_cuda_kernel.cuh"
+#undef EL_EWALD_TAB
+#undef NB_KERNEL_FUNC_NAME
+
+/* Tabulated Ewald interaction kernels with twin-range cut-off */
+#define EL_EWALD_TAB
+#define VDW_CUTOFF_CHECK
+#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab_twin##__VA_ARGS__
#include "nbnxn_cuda_kernel_legacy.cuh"
#include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD
+#undef EL_EWALD_TAB
#undef VDW_CUTOFF_CHECK
#undef NB_KERNEL_FUNC_NAME
extern "C" {
#endif
-/*! Types of electrostatics available in the CUDA nonbonded force kernels. */
+/*! Types of electrostatics implementations available in the CUDA non-bonded
+ * force kernels. These represent both the electrostatics types implemented
+ * by the kernels (cut-off, RF, and Ewald - a subset of what's defined in
+ * enums.h) as well as encode implementation details analytical/tabulated
+ * and single or twin cut-off (for Ewald kernels).
+ * Note that the cut-off and RF kernels have only analytical flavor and unlike
+ * in the CPU kernels, the tabulated kernels are ATM Ewald-only.
+ *
+ * The order of pointers to different electrostatic kernels defined in
+ * nbnxn_cuda.cu by the nb_default_kfunc_ptr and nb_legacy_kfunc_ptr arrays
+ * should match the order of enumerated types below. */
enum {
- eelCuEWALD, eelCuEWALD_TWIN, eelCuRF, eelCuCUT, eelCuNR
+ eelCuCUT, eelCuRF, eelCuEWALD_TAB, eelCuEWALD_TAB_TWIN, eelCuEWALD_ANA, eelCuEWALD_ANA_TWIN, eelCuNR
};
+/*! Kernel flavors with different set of optimizations: default for CUDA <=v4.1
+ * compilers and legacy for earlier, 3.2 and 4.0 CUDA compilers. */
enum {
- eNbnxnCuKDefault, eNbnxnCuKLegacy, eNbnxnCuKOld, eNbnxnCuKNR
+ eNbnxnCuKDefault, eNbnxnCuKLegacy, eNbnxnCuKNR
};
#define NBNXN_KVER_OLD(k) (k == eNbnxnCuKOld)
GMX_BARRIER(cr->mpi_comm_mygroup);
}
+
if (inputrec->ePull == epullCONSTRAINT)
{
clear_pull_forces(inputrec->pull);
}
- /* update QMMMrec, if necessary */
- if (fr->bQMMM)
- {
- update_QMMMrec(cr, fr, x, mdatoms, box, top);
- }
-
- if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
- {
- posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
- f, enerd, lambda, fr);
- }
-
- /* Compute the bonded and non-bonded energies and optionally forces */
- do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
- cr, nrnb, wcycle, mdatoms, &(inputrec->opts),
- x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, mtop, top, fr->born,
- &(top->atomtypes), bBornRadii, box,
- inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
- flags, &cycles_pme);
-
- if (bSepLRF)
- {
- if (do_per_step(step, inputrec->nstcalclr))
- {
- /* Add the long range forces to the short range forces */
- for (i = 0; i < fr->natoms_force_constr; i++)
- {
- rvec_add(fr->f_twin[i], f[i], f[i]);
- }
- }
- }
+ /* We calculate the non-bonded forces, when done on the CPU, here.
+ * We do this before calling do_force_lowlevel, as in there bondeds
+ * forces are calculated before PME, which does communication.
+ * With this order, non-bonded and bonded force calculation imbalance
+ * can be balanced out by the domain decomposition load balancing.
+ */
if (!bUseOrEmulGPU)
{
nrnb, wcycle);
}
-
if (!bUseOrEmulGPU || bDiffKernels)
{
int aloc;
}
}
+ /* update QMMMrec, if necessary */
+ if (fr->bQMMM)
+ {
+ update_QMMMrec(cr, fr, x, mdatoms, box, top);
+ }
+
+ if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
+ {
+ posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+ f, enerd, lambda, fr);
+ }
+
+ /* Compute the bonded and non-bonded energies and optionally forces */
+ do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+ cr, nrnb, wcycle, mdatoms, &(inputrec->opts),
+ x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, mtop, top, fr->born,
+ &(top->atomtypes), bBornRadii, box,
+ inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
+ flags, &cycles_pme);
+
+ if (bSepLRF)
+ {
+ if (do_per_step(step, inputrec->nstcalclr))
+ {
+ /* Add the long range forces to the short range forces */
+ for (i = 0; i < fr->natoms_force_constr; i++)
+ {
+ rvec_add(fr->f_twin[i], f[i], f[i]);
+ }
+ }
+ }
+
cycles_force += wallcycle_stop(wcycle, ewcFORCE);
GMX_BARRIER(cr->mpi_comm_mygroup);
"REGRESSIONTEST_DOWNLOAD not supported with cmake ${CMAKE_VERSION}" FORCE)
endif()
if(REGRESSIONTEST_DOWNLOAD)
- if(NOT REGRESSIONTEST_VERSION)
- message(FATAL_ERROR "The configuration files do not specify what regressiontests tarball is suitable for automated download and testing. Please obtain and use a suitable set of tests yourself.")
+ if("${PROJECT_VERSION}" MATCHES "-dev")
+ if("${PROJECT_VERSION}" MATCHES "^5[.]")
+ set(REGRESSIONTEST_URL http://gerrit.gromacs.org/snapshot/refs/heads/master)
+ else()
+ set(REGRESSIONTEST_URL http://gerrit.gromacs.org/snapshot/refs/heads/release-4-6)
+ endif()
+ set(REGRESSIONTEST_PATH "${CMAKE_CURRENT_BINARY_DIR}/regressiontests"
+ CACHE PATH "Path to auto-downloaded regressiontests" FORCE)
+ else()
+ if(NOT REGRESSIONTEST_VERSION)
+ message(FATAL_ERROR "The configuration files do not specify what regressiontests tarball is suitable for automated download and testing. Please obtain and use a suitable set of tests yourself.")
+ endif()
+ set(REGRESSIONTEST_URL http://gerrit.gromacs.org/download/regressiontests-${REGRESSIONTEST_VERSION}.tar.gz)
+ set(REGRESSIONTEST_PATH
+ "${CMAKE_CURRENT_BINARY_DIR}/regressiontests-${REGRESSIONTEST_VERSION}"
+ CACHE PATH "Path to auto-downloaded regressiontests" FORCE)
endif()
- set(REGRESSIONTEST_URL
- http://gerrit.gromacs.org/download/regressiontests-${REGRESSIONTEST_VERSION}.tar.gz)
set(REGRESSIONTEST_FILE "${CMAKE_CURRENT_BINARY_DIR}/regressiontests.tgz")
message("Downloading: ${REGRESSIONTEST_URL}")
file(DOWNLOAD ${REGRESSIONTEST_URL} "${REGRESSIONTEST_FILE}" SHOW_PROGRESS STATUS status LOG log)
list(GET status 0 status_code)
list(GET status 1 status_string)
-
+
if(NOT status_code EQUAL 0)
message(FATAL_ERROR "error: downloading '${REGRESSIONTEST_URL}' failed
status_code: ${status_code}
log: ${log}")
endif()
- set(REGRESSIONTEST_PATH
- "${CMAKE_CURRENT_BINARY_DIR}/regressiontests-${REGRESSIONTEST_VERSION}"
- CACHE PATH "Path to auto-downloaded regressiontests" FORCE)
file(REMOVE_RECURSE "${REGRESSIONTEST_PATH}") #delete potential prior folder
execute_process(COMMAND ${CMAKE_COMMAND} -E tar xf "${REGRESSIONTEST_FILE}"
WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")