Merge branch release-5-1 into release-2016
[alexxy/gromacs.git] / src / programs / mdrun / runner.cpp
index 7489b07dcbee6c6f730415ef166ed857afaffe4b..969a3d9513503547578f12bcbfa06c9a893b6c40 100644 (file)
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-
+/*! \internal \file
+ *
+ * \brief Implements the MD runner routine calling all integrators.
+ *
+ * \author David van der Spoel <david.vanderspoel@icm.uu.se>
+ * \ingroup module_mdlib
+ */
 #include "gmxpre.h"
 
+#include "runner.h"
+
 #include "config.h"
 
 #include <assert.h>
 
 #include <algorithm>
 
+#include "gromacs/commandline/filenm.h"
 #include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_struct.h"
 #include "gromacs/essentialdynamics/edsam.h"
 #include "gromacs/ewald/pme.h"
+#include "gromacs/fileio/checkpoint.h"
+#include "gromacs/fileio/oenv.h"
 #include "gromacs/fileio/tpxio.h"
-#include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
-#include "gromacs/legacyheaders/checkpoint.h"
-#include "gromacs/legacyheaders/constr.h"
-#include "gromacs/legacyheaders/copyrite.h"
-#include "gromacs/legacyheaders/disre.h"
-#include "gromacs/legacyheaders/force.h"
-#include "gromacs/legacyheaders/gmx_detect_hardware.h"
-#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
-#include "gromacs/legacyheaders/gmx_thread_affinity.h"
-#include "gromacs/legacyheaders/inputrec.h"
-#include "gromacs/legacyheaders/main.h"
-#include "gromacs/legacyheaders/md_logging.h"
-#include "gromacs/legacyheaders/md_support.h"
-#include "gromacs/legacyheaders/mdatoms.h"
-#include "gromacs/legacyheaders/mdrun.h"
-#include "gromacs/legacyheaders/names.h"
-#include "gromacs/legacyheaders/network.h"
-#include "gromacs/legacyheaders/oenv.h"
-#include "gromacs/legacyheaders/orires.h"
-#include "gromacs/legacyheaders/qmmm.h"
-#include "gromacs/legacyheaders/sighandler.h"
-#include "gromacs/legacyheaders/txtdump.h"
-#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/gmxlib/md_logging.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/hardware/cpuinfo.h"
+#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/listed-forces/disre.h"
+#include "gromacs/listed-forces/orires.h"
 #include "gromacs/math/calculate-ewald-splitting-coefficient.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/utilities.h"
 #include "gromacs/math/vec.h"
 #include "gromacs/mdlib/calc_verletbuf.h"
+#include "gromacs/mdlib/constr.h"
+#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/forcerec.h"
+#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/integrator.h"
+#include "gromacs/mdlib/main.h"
+#include "gromacs/mdlib/md_support.h"
+#include "gromacs/mdlib/mdatoms.h"
+#include "gromacs/mdlib/mdrun.h"
+#include "gromacs/mdlib/minimize.h"
 #include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/qmmm.h"
+#include "gromacs/mdlib/sighandler.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdlib/tpi.h"
+#include "gromacs/mdrunutility/threadaffinity.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/state.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pulling/pull.h"
 #include "gromacs/pulling/pull_rotation.h"
-#include "gromacs/swap/swapcoords.h"
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/topology/mtop_util.h"
+#include "gromacs/trajectory/trajectoryframe.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/pleasecite.h"
 #include "gromacs/utility/smalloc.h"
 
 #include "deform.h"
-#include "membed.h"
+#include "md.h"
 #include "repl_ex.h"
 #include "resource-division.h"
 
 #include "corewrap.h"
 #endif
 
-typedef struct {
-    gmx_integrator_t *func;
-} gmx_intp_t;
-
-/* The array should match the eI array in include/types/enums.h */
-const gmx_intp_t    integrator[eiNR] = { {do_md}, {do_steep}, {do_cg}, {do_md}, {do_md}, {do_nm}, {do_lbfgs}, {do_tpi}, {do_tpi}, {do_md}, {do_md}, {do_md}};
-
+//! First step used in pressure scaling
 gmx_int64_t         deform_init_init_step_tpx;
+//! Initial box for pressure scaling
 matrix              deform_init_box_tpx;
+//! MPI variable for use in pressure scaling
 tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
 
-
-#ifdef GMX_THREAD_MPI
+#if GMX_THREAD_MPI
 /* The minimum number of atoms per tMPI thread. With fewer atoms than this,
  * the number of threads will get lowered.
  */
@@ -117,38 +133,38 @@ tMPI_Thread_mutex_t deform_init_box_mutex = TMPI_THREAD_MUTEX_INITIALIZER;
 
 struct mdrunner_arglist
 {
-    gmx_hw_opt_t    hw_opt;
-    FILE           *fplog;
-    t_commrec      *cr;
-    int             nfile;
-    const t_filenm *fnm;
-    output_env_t    oenv;
-    gmx_bool        bVerbose;
-    gmx_bool        bCompact;
-    int             nstglobalcomm;
-    ivec            ddxyz;
-    int             dd_node_order;
-    real            rdd;
-    real            rconstr;
-    const char     *dddlb_opt;
-    real            dlb_scale;
-    const char     *ddcsx;
-    const char     *ddcsy;
-    const char     *ddcsz;
-    const char     *nbpu_opt;
-    int             nstlist_cmdline;
-    gmx_int64_t     nsteps_cmdline;
-    int             nstepout;
-    int             resetstep;
-    int             nmultisim;
-    int             repl_ex_nst;
-    int             repl_ex_nex;
-    int             repl_ex_seed;
-    real            pforce;
-    real            cpt_period;
-    real            max_hours;
-    int             imdport;
-    unsigned long   Flags;
+    gmx_hw_opt_t            hw_opt;
+    FILE                   *fplog;
+    t_commrec              *cr;
+    int                     nfile;
+    const t_filenm         *fnm;
+    const gmx_output_env_t *oenv;
+    gmx_bool                bVerbose;
+    int                     nstglobalcomm;
+    ivec                    ddxyz;
+    int                     dd_rank_order;
+    int                     npme;
+    real                    rdd;
+    real                    rconstr;
+    const char             *dddlb_opt;
+    real                    dlb_scale;
+    const char             *ddcsx;
+    const char             *ddcsy;
+    const char             *ddcsz;
+    const char             *nbpu_opt;
+    int                     nstlist_cmdline;
+    gmx_int64_t             nsteps_cmdline;
+    int                     nstepout;
+    int                     resetstep;
+    int                     nmultisim;
+    int                     repl_ex_nst;
+    int                     repl_ex_nex;
+    int                     repl_ex_seed;
+    real                    pforce;
+    real                    cpt_period;
+    real                    max_hours;
+    int                     imdport;
+    unsigned long           Flags;
 };
 
 
@@ -157,44 +173,50 @@ struct mdrunner_arglist
    a commrec. */
 static void mdrunner_start_fn(void *arg)
 {
-    struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
-    struct mdrunner_arglist  mc  = *mda; /* copy the arg list to make sure
-                                            that it's thread-local. This doesn't
-                                            copy pointed-to items, of course,
-                                            but those are all const. */
-    t_commrec *cr;                       /* we need a local version of this */
-    FILE      *fplog = NULL;
-    t_filenm  *fnm;
+    try
+    {
+        struct mdrunner_arglist *mda = (struct mdrunner_arglist*)arg;
+        struct mdrunner_arglist  mc  = *mda; /* copy the arg list to make sure
+                                                that it's thread-local. This doesn't
+                                                copy pointed-to items, of course,
+                                                but those are all const. */
+        t_commrec *cr;                       /* we need a local version of this */
+        FILE      *fplog = NULL;
+        t_filenm  *fnm;
 
-    fnm = dup_tfn(mc.nfile, mc.fnm);
+        fnm = dup_tfn(mc.nfile, mc.fnm);
 
-    cr = reinitialize_commrec_for_this_thread(mc.cr);
+        cr = reinitialize_commrec_for_this_thread(mc.cr);
 
-    if (MASTER(cr))
-    {
-        fplog = mc.fplog;
-    }
+        if (MASTER(cr))
+        {
+            fplog = mc.fplog;
+        }
 
-    mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
-             mc.bVerbose, mc.bCompact, mc.nstglobalcomm,
-             mc.ddxyz, mc.dd_node_order, mc.rdd,
-             mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
-             mc.ddcsx, mc.ddcsy, mc.ddcsz,
-             mc.nbpu_opt, mc.nstlist_cmdline,
-             mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
-             mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
-             mc.cpt_period, mc.max_hours, mc.imdport, mc.Flags);
+        gmx::mdrunner(&mc.hw_opt, fplog, cr, mc.nfile, fnm, mc.oenv,
+                      mc.bVerbose, mc.nstglobalcomm,
+                      mc.ddxyz, mc.dd_rank_order, mc.npme, mc.rdd,
+                      mc.rconstr, mc.dddlb_opt, mc.dlb_scale,
+                      mc.ddcsx, mc.ddcsy, mc.ddcsz,
+                      mc.nbpu_opt, mc.nstlist_cmdline,
+                      mc.nsteps_cmdline, mc.nstepout, mc.resetstep,
+                      mc.nmultisim, mc.repl_ex_nst, mc.repl_ex_nex, mc.repl_ex_seed, mc.pforce,
+                      mc.cpt_period, mc.max_hours, mc.imdport, mc.Flags);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 }
 
+
 /* called by mdrunner() to start a specific number of threads (including
    the main thread) for thread-parallel runs. This in turn calls mdrunner()
    for each thread.
    All options besides nthreads are the same as for mdrunner(). */
 static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
                                          FILE *fplog, t_commrec *cr, int nfile,
-                                         const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
-                                         gmx_bool bCompact, int nstglobalcomm,
-                                         ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+                                         const t_filenm fnm[], const gmx_output_env_t *oenv, gmx_bool bVerbose,
+                                         int nstglobalcomm,
+                                         ivec ddxyz, int dd_rank_order, int npme,
+                                         real rdd, real rconstr,
                                          const char *dddlb_opt, real dlb_scale,
                                          const char *ddcsx, const char *ddcsy, const char *ddcsz,
                                          const char *nbpu_opt, int nstlist_cmdline,
@@ -228,12 +250,12 @@ static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
     mda->fnm             = fnmn;
     mda->oenv            = oenv;
     mda->bVerbose        = bVerbose;
-    mda->bCompact        = bCompact;
     mda->nstglobalcomm   = nstglobalcomm;
     mda->ddxyz[XX]       = ddxyz[XX];
     mda->ddxyz[YY]       = ddxyz[YY];
     mda->ddxyz[ZZ]       = ddxyz[ZZ];
-    mda->dd_node_order   = dd_node_order;
+    mda->dd_rank_order   = dd_rank_order;
+    mda->npme            = npme;
     mda->rdd             = rdd;
     mda->rconstr         = rconstr;
     mda->dddlb_opt       = dddlb_opt;
@@ -271,12 +293,15 @@ static t_commrec *mdrunner_start_threads(gmx_hw_opt_t *hw_opt,
 #endif /* GMX_THREAD_MPI */
 
 
-/* We determine the extra cost of the non-bonded kernels compared to
+/*! \brief Cost of non-bonded kernels
+ *
+ * We determine the extra cost of the non-bonded kernels compared to
  * a reference nstlist value of 10 (which is the default in grompp).
  */
 static const int    nbnxnReferenceNstlist = 10;
-/* The values to try when switching  */
+//! The values to try when switching
 const int           nstlist_try[] = { 20, 25, 40 };
+//! Number of elements in the neighborsearch list trials.
 #define NNSTL  sizeof(nstlist_try)/sizeof(nstlist_try[0])
 /* Increase nstlist until the non-bonded cost increases more than listfac_ok,
  * but never more than listfac_max.
@@ -288,17 +313,26 @@ const int           nstlist_try[] = { 20, 25, 40 };
  * nstlist will not be increased enough to reach optimal performance.
  */
 /* CPU: pair-search is about a factor 1.5 slower than the non-bonded kernel */
+//! Max OK performance ratio beween force calc and neighbor searching
 static const float  nbnxn_cpu_listfac_ok    = 1.05;
+//! Too high performance ratio beween force calc and neighbor searching
 static const float  nbnxn_cpu_listfac_max   = 1.09;
+/* CPU: pair-search is about a factor 2-3 slower than the non-bonded kernel */
+//! Max OK performance ratio beween force calc and neighbor searching
+static const float  nbnxn_knl_listfac_ok    = 1.22;
+//! Too high performance ratio beween force calc and neighbor searching
+static const float  nbnxn_knl_listfac_max   = 1.3;
 /* GPU: pair-search is a factor 1.5-3 slower than the non-bonded kernel */
+//! Max OK performance ratio beween force calc and neighbor searching
 static const float  nbnxn_gpu_listfac_ok    = 1.20;
+//! Too high performance ratio beween force calc and neighbor searching
 static const float  nbnxn_gpu_listfac_max   = 1.30;
 
-/* Try to increase nstlist when using the Verlet cut-off scheme */
+/*! \brief Try to increase nstlist when using the Verlet cut-off scheme */
 static void increase_nstlist(FILE *fp, t_commrec *cr,
                              t_inputrec *ir, int nstlist_cmdline,
                              const gmx_mtop_t *mtop, matrix box,
-                             gmx_bool bGPU)
+                             gmx_bool bGPU, const gmx::CpuInfo &cpuinfo)
 {
     float                  listfac_ok, listfac_max;
     int                    nstlist_orig, nstlist_prev;
@@ -314,7 +348,6 @@ static void increase_nstlist(FILE *fp, t_commrec *cr,
     const char            *box_err  = "Can not increase nstlist because the box is too small";
     const char            *dd_err   = "Can not increase nstlist because of domain decomposition limitations";
     char                   buf[STRLEN];
-    const float            oneThird = 1.0f / 3.0f;
 
     if (nstlist_cmdline <= 0)
     {
@@ -380,6 +413,11 @@ static void increase_nstlist(FILE *fp, t_commrec *cr,
         listfac_ok  = nbnxn_gpu_listfac_ok;
         listfac_max = nbnxn_gpu_listfac_max;
     }
+    else if (cpuinfo.feature(gmx::CpuInfo::Feature::X86_Avx512ER))
+    {
+        listfac_ok  = nbnxn_knl_listfac_ok;
+        listfac_max = nbnxn_knl_listfac_max;
+    }
     else
     {
         listfac_ok  = nbnxn_cpu_listfac_ok;
@@ -411,8 +449,8 @@ static void increase_nstlist(FILE *fp, t_commrec *cr,
     /* Determine the pair list size increase due to zero interactions */
     rlist_inc = nbnxn_get_rlist_effective_inc(ls.cluster_size_j,
                                               mtop->natoms/det(box));
-    rlist_ok  = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_ok, oneThird) - rlist_inc;
-    rlist_max = (rlistWithReferenceNstlist + rlist_inc)*pow(listfac_max, oneThird) - rlist_inc;
+    rlist_ok  = (rlistWithReferenceNstlist + rlist_inc)*std::cbrt(listfac_ok) - rlist_inc;
+    rlist_max = (rlistWithReferenceNstlist + rlist_inc)*std::cbrt(listfac_max) - rlist_inc;
     if (debug)
     {
         fprintf(debug, "nstlist tuning: rlist_inc %.3f rlist_ok %.3f rlist_max %.3f\n",
@@ -432,7 +470,7 @@ static void increase_nstlist(FILE *fp, t_commrec *cr,
         calc_verlet_buffer_size(mtop, det(box), ir, -1, &ls, NULL, &rlist_new);
 
         /* Does rlist fit in the box? */
-        bBox = (sqr(rlist_new) < max_cutoff2(ir->ePBC, box));
+        bBox = (gmx::square(rlist_new) < max_cutoff2(ir->ePBC, box));
         bDD  = TRUE;
         if (bBox && DOMAINDECOMP(cr))
         {
@@ -499,17 +537,18 @@ static void increase_nstlist(FILE *fp, t_commrec *cr,
             fprintf(fp, "%s\n\n", buf);
         }
         ir->rlist     = rlist_new;
-        ir->rlistlong = rlist_new;
     }
 }
 
+/*! \brief Initialize variables for Verlet scheme simulation */
 static void prepare_verlet_scheme(FILE                           *fplog,
                                   t_commrec                      *cr,
                                   t_inputrec                     *ir,
                                   int                             nstlist_cmdline,
                                   const gmx_mtop_t               *mtop,
                                   matrix                          box,
-                                  gmx_bool                        bUseGPU)
+                                  gmx_bool                        bUseGPU,
+                                  const gmx::CpuInfo             &cpuinfo)
 {
     /* For NVE simulations, we will retain the initial list buffer */
     if (EI_DYNAMICS(ir->eI) &&
@@ -537,7 +576,6 @@ static void prepare_verlet_scheme(FILE                           *fplog,
                         ls.cluster_size_i, ls.cluster_size_j);
             }
             ir->rlist     = rlist_new;
-            ir->rlistlong = rlist_new;
         }
     }
 
@@ -550,11 +588,14 @@ static void prepare_verlet_scheme(FILE                           *fplog,
     if (EI_DYNAMICS(ir->eI))
     {
         /* Set or try nstlist values */
-        increase_nstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, bUseGPU);
+        increase_nstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, bUseGPU, cpuinfo);
     }
 }
 
-/* Override the value in inputrec with value passed on the command line (if any) */
+/*! \brief Override the nslist value in inputrec
+ *
+ * with value passed on the command line (if any)
+ */
 static void override_nsteps_cmdline(FILE            *fplog,
                                     gmx_int64_t      nsteps_cmdline,
                                     t_inputrec      *ir,
@@ -592,11 +633,51 @@ static void override_nsteps_cmdline(FILE            *fplog,
     /* Do nothing if nsteps_cmdline == -2 */
 }
 
+namespace gmx
+{
+
+//! \brief Return the correct integrator function.
+static integrator_t *my_integrator(unsigned int ei)
+{
+    switch (ei)
+    {
+        case eiMD:
+        case eiBD:
+        case eiSD1:
+        case eiVV:
+        case eiVVAK:
+            if (!EI_DYNAMICS(ei))
+            {
+                GMX_THROW(APIError("do_md integrator would be called for a non-dynamical integrator"));
+            }
+            return do_md;
+        case eiSteep:
+            return do_steep;
+        case eiCG:
+            return do_cg;
+        case eiNM:
+            return do_nm;
+        case eiLBFGS:
+            return do_lbfgs;
+        case eiTPI:
+        case eiTPIC:
+            if (!EI_TPI(ei))
+            {
+                GMX_THROW(APIError("do_tpi integrator would be called for a non-TPI integrator"));
+            }
+            return do_tpi;
+        case eiSD2_REMOVED:
+            GMX_THROW(NotImplementedError("SD2 integrator has been removed"));
+        default:
+            GMX_THROW(APIError("Non existing integrator selected"));
+    }
+}
+
 int mdrunner(gmx_hw_opt_t *hw_opt,
              FILE *fplog, t_commrec *cr, int nfile,
-             const t_filenm fnm[], const output_env_t oenv, gmx_bool bVerbose,
-             gmx_bool bCompact, int nstglobalcomm,
-             ivec ddxyz, int dd_node_order, real rdd, real rconstr,
+             const t_filenm fnm[], const gmx_output_env_t *oenv, gmx_bool bVerbose,
+             int nstglobalcomm,
+             ivec ddxyz, int dd_rank_order, int npme, real rdd, real rconstr,
              const char *dddlb_opt, real dlb_scale,
              const char *ddcsx, const char *ddcsy, const char *ddcsz,
              const char *nbpu_opt, int nstlist_cmdline,
@@ -623,17 +704,14 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     gmx_constr_t              constr;
     int                       nChargePerturbed = -1, nTypePerturbed = 0, status;
     gmx_wallcycle_t           wcycle;
-    gmx_bool                  bReadEkin;
     gmx_walltime_accounting_t walltime_accounting = NULL;
     int                       rc;
     gmx_int64_t               reset_counters;
     gmx_edsam_t               ed           = NULL;
     int                       nthreads_pme = 1;
-    int                       nthreads_pp  = 1;
-    gmx_membed_t              membed       = NULL;
     gmx_hw_info_t            *hwinfo       = NULL;
     /* The master rank decides early on bUseGPU and broadcasts this later */
-    gmx_bool                  bUseGPU      = FALSE;
+    gmx_bool                  bUseGPU            = FALSE;
 
     /* CAUTION: threads may be started later on in this function, so
        cr doesn't reflect the final parallel state right now */
@@ -671,7 +749,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     if (SIMMASTER(cr))
     {
         /* Read (nearly) all data required for the simulation */
-        read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, state, NULL, mtop);
+        read_tpx_state(ftp2fn(efTPR, nfile, fnm), inputrec, state, mtop);
 
         if (inputrec->cutoff_scheme == ecutsVERLET)
         {
@@ -697,7 +775,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
 
             prepare_verlet_scheme(fplog, cr,
                                   inputrec, nstlist_cmdline, mtop, state->box,
-                                  bUseGPU);
+                                  bUseGPU, *hwinfo->cpuInfo);
         }
         else
         {
@@ -718,34 +796,26 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
                 gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
             }
 
-#ifdef GMX_TARGET_BGQ
+#if GMX_TARGET_BGQ
             md_print_warn(cr, fplog,
                           "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
                           "      BlueGene/Q. You will observe better performance from using the\n"
                           "      Verlet cut-off scheme.\n");
 #endif
         }
-
-        if (inputrec->eI == eiSD2)
-        {
-            md_print_warn(cr, fplog, "The stochastic dynamics integrator %s is deprecated, since\n"
-                          "it is slower than integrator %s and is slightly less accurate\n"
-                          "with constraints. Use the %s integrator.",
-                          ei_names[inputrec->eI], ei_names[eiSD1], ei_names[eiSD1]);
-        }
     }
 
     /* Check and update the hardware options for internal consistency */
-    check_and_update_hw_opt_1(hw_opt, cr);
+    check_and_update_hw_opt_1(hw_opt, cr, npme);
 
     /* Early check for externally set process affinity. */
     gmx_check_thread_affinity_set(fplog, cr,
                                   hw_opt, hwinfo->nthreads_hw_avail, FALSE);
 
-#ifdef GMX_THREAD_MPI
+#if GMX_THREAD_MPI
     if (SIMMASTER(cr))
     {
-        if (cr->npmenodes > 0 && hw_opt->nthreads_tmpi <= 0)
+        if (npme > 0 && hw_opt->nthreads_tmpi <= 0)
         {
             gmx_fatal(FARGS, "You need to explicitly specify the number of MPI threads (-ntmpi) when using separate PME ranks");
         }
@@ -767,8 +837,8 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
             t_commrec *cr_old       = cr;
             /* now start the threads. */
             cr = mdrunner_start_threads(hw_opt, fplog, cr_old, nfile, fnm,
-                                        oenv, bVerbose, bCompact, nstglobalcomm,
-                                        ddxyz, dd_node_order, rdd, rconstr,
+                                        oenv, bVerbose, nstglobalcomm,
+                                        ddxyz, dd_rank_order, npme, rdd, rconstr,
                                         dddlb_opt, dlb_scale, ddcsx, ddcsy, ddcsz,
                                         nbpu_opt, nstlist_cmdline,
                                         nsteps_cmdline, nstepout, resetstep, nmultisim,
@@ -786,18 +856,6 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
 #endif
     /* END OF CAUTION: cr is now reliable */
 
-    /* g_membed initialisation *
-     * Because we change the mtop, init_membed is called before the init_parallel *
-     * (in case we ever want to make it run in parallel) */
-    if (opt2bSet("-membed", nfile, fnm))
-    {
-        if (MASTER(cr))
-        {
-            fprintf(stderr, "Initializing membed");
-        }
-        membed = init_membed(fplog, nfile, fnm, mtop, inputrec, state, cr, &cpt_period);
-    }
-
     if (PAR(cr))
     {
         /* now broadcast everything to the non-master nodes/threads: */
@@ -821,14 +879,14 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     /* A parallel command line option consistency check that we can
        only do after any threads have started. */
     if (!PAR(cr) &&
-        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || cr->npmenodes > 0))
+        (ddxyz[XX] > 1 || ddxyz[YY] > 1 || ddxyz[ZZ] > 1 || npme > 0))
     {
         gmx_fatal(FARGS,
                   "The -dd or -npme option request a parallel simulation, "
-#ifndef GMX_MPI
+#if !GMX_MPI
                   "but %s was compiled without threads or MPI enabled"
 #else
-#ifdef GMX_THREAD_MPI
+#if GMX_THREAD_MPI
                   "but the number of MPI-threads (option -ntmpi) is not set or is 1"
 #else
                   "but %s was not started through mpirun/mpiexec or only one rank was requested through mpirun/mpiexec"
@@ -851,22 +909,22 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
 
     if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
     {
-        if (cr->npmenodes > 0)
+        if (npme > 0)
         {
-            gmx_fatal_collective(FARGS, cr, NULL,
+            gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
                                  "PME-only ranks are requested, but the system does not use PME for electrostatics or LJ");
         }
 
-        cr->npmenodes = 0;
+        npme = 0;
     }
 
-    if (bUseGPU && cr->npmenodes < 0)
+    if (bUseGPU && npme < 0)
     {
         /* With GPUs we don't automatically use PME-only ranks. PME ranks can
          * improve performance with many threads per GPU, since our OpenMP
          * scaling is bad, but it's difficult to automate the setup.
          */
-        cr->npmenodes = 0;
+        npme = 0;
     }
 
 #ifdef GMX_FAHCORE
@@ -891,7 +949,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     init_orires(fplog, mtop, state->x, inputrec, cr, &(fcd->orires),
                 state);
 
-    if (DEFORM(*inputrec))
+    if (inputrecDeform(inputrec))
     {
         /* Store the deform reference box before reading the checkpoint */
         if (SIMMASTER(cr))
@@ -914,23 +972,22 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         tMPI_Thread_mutex_unlock(&deform_init_box_mutex);
     }
 
-    if (opt2bSet("-cpi", nfile, fnm))
+    if (Flags & MD_STARTFROMCPT)
     {
         /* Check if checkpoint file exists before doing continuation.
          * This way we can use identical input options for the first and subsequent runs...
          */
-        if (gmx_fexist_master(opt2fn_master("-cpi", nfile, fnm, cr), cr) )
-        {
-            load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
-                            cr, ddxyz,
-                            inputrec, state, &bReadEkin,
-                            (Flags & MD_APPENDFILES),
-                            (Flags & MD_APPENDFILESSET));
+        gmx_bool bReadEkin;
 
-            if (bReadEkin)
-            {
-                Flags |= MD_READ_EKIN;
-            }
+        load_checkpoint(opt2fn_master("-cpi", nfile, fnm, cr), &fplog,
+                        cr, ddxyz, &npme,
+                        inputrec, state, &bReadEkin,
+                        (Flags & MD_APPENDFILES),
+                        (Flags & MD_APPENDFILESSET));
+
+        if (bReadEkin)
+        {
+            Flags |= MD_READ_EKIN;
         }
     }
 
@@ -953,6 +1010,9 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         gmx_bcast(sizeof(box), box, cr);
     }
 
+    // TODO This should move to do_md(), because it only makes sense
+    // with dynamical integrators, but there is no test coverage and
+    // it interacts with constraints, somehow.
     /* Essential dynamics */
     if (opt2bSet("-ei", nfile, fnm))
     {
@@ -963,17 +1023,14 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     if (PAR(cr) && !(EI_TPI(inputrec->eI) ||
                      inputrec->eI == eiNM))
     {
-        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, rdd, rconstr,
+        cr->dd = init_domain_decomposition(fplog, cr, Flags, ddxyz, npme,
+                                           dd_rank_order,
+                                           rdd, rconstr,
                                            dddlb_opt, dlb_scale,
                                            ddcsx, ddcsy, ddcsz,
                                            mtop, inputrec,
                                            box, state->x,
                                            &ddbox, &npme_major, &npme_minor);
-
-        make_dd_communicators(fplog, cr, dd_node_order);
-
-        /* Set overallocation to avoid frequent reallocation of arrays */
-        set_over_alloc_dd(TRUE);
     }
     else
     {
@@ -1001,7 +1058,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
 
     /* Initialize per-physical-node MPI process/thread ID and counters. */
     gmx_init_intranode_counters(cr);
-#ifdef GMX_MPI
+#if GMX_MPI
     if (MULTISIM(cr))
     {
         md_print_info(cr, fplog,
@@ -1011,7 +1068,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     }
     md_print_info(cr, fplog, "Using %d MPI %s\n",
                   cr->nnodes,
-#ifdef GMX_THREAD_MPI
+#if GMX_THREAD_MPI
                   cr->nnodes == 1 ? "thread" : "threads"
 #else
                   cr->nnodes == 1 ? "process" : "processes"
@@ -1034,7 +1091,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
                           inputrec->cutoff_scheme == ecutsVERLET);
 
 #ifndef NDEBUG
-    if (integrator[inputrec->eI].func != do_tpi &&
+    if (EI_TPI(inputrec->eI) &&
         inputrec->cutoff_scheme == ecutsVERLET)
     {
         gmx_feenableexcept();
@@ -1071,13 +1128,9 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
        PME: env variable should be read only on one node to make sure it is
        identical everywhere;
      */
-    /* TODO nthreads_pp is only used for pinning threads.
-     * This is a temporary solution until we have a hw topology library.
-     */
-    nthreads_pp  = gmx_omp_nthreads_get(emntNonbonded);
     nthreads_pme = gmx_omp_nthreads_get(emntPME);
 
-    wcycle = wallcycle_init(fplog, resetstep, cr, nthreads_pp, nthreads_pme);
+    wcycle = wallcycle_init(fplog, resetstep, cr);
 
     if (PAR(cr))
     {
@@ -1097,20 +1150,14 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         fr          = mk_forcerec();
         fr->hwinfo  = hwinfo;
         fr->gpu_opt = &hw_opt->gpu_opt;
-        init_forcerec(fplog, oenv, fr, fcd, inputrec, mtop, cr, box,
+        init_forcerec(fplog, fr, fcd, inputrec, mtop, cr, box,
                       opt2fn("-table", nfile, fnm),
-                      opt2fn("-tabletf", nfile, fnm),
                       opt2fn("-tablep", nfile, fnm),
                       getFilenm("-tableb", nfile, fnm),
                       nbpu_opt,
                       FALSE,
                       pforce);
 
-        /* version for PCA_NOT_READ_NODE (see md.c) */
-        /*init_forcerec(fplog,fr,fcd,inputrec,mtop,cr,box,FALSE,
-           "nofile","nofile","nofile","nofile",FALSE,pforce);
-         */
-
         /* Initialize QM-MM */
         if (fr->bQMMM)
         {
@@ -1217,7 +1264,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     }
 
 
-    if (integrator[inputrec->eI].func == do_md)
+    if (EI_DYNAMICS(inputrec->eI))
     {
         /* Turn on signal handling on all nodes */
         /*
@@ -1244,53 +1291,44 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
         if (inputrec->bRot)
         {
             /* Initialize enforced rotation code */
-            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, box, mtop, oenv,
+            init_rot(fplog, inputrec, nfile, fnm, cr, state->x, state->box, mtop, oenv,
                      bVerbose, Flags);
         }
 
-        if (inputrec->eSwapCoords != eswapNO)
-        {
-            /* Initialize ion swapping code */
-            init_swapcoords(fplog, bVerbose, inputrec, opt2fn_master("-swap", nfile, fnm, cr),
-                            mtop, state->x, state->box, &state->swapstate, cr, oenv, Flags);
-        }
-
         constr = init_constraints(fplog, mtop, inputrec, ed, state, cr);
 
         if (DOMAINDECOMP(cr))
         {
             GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
+            /* This call is not included in init_domain_decomposition mainly
+             * because fr->cginfo_mb is set later.
+             */
             dd_init_bondeds(fplog, cr->dd, mtop, vsite, inputrec,
                             Flags & MD_DDBONDCHECK, fr->cginfo_mb);
-
-            set_dd_parameters(fplog, cr->dd, dlb_scale, inputrec, &ddbox);
-
-            setup_dd_grid(fplog, cr->dd);
         }
 
         /* Now do whatever the user wants us to do (how flexible...) */
-        integrator[inputrec->eI].func(fplog, cr, nfile, fnm,
-                                      oenv, bVerbose, bCompact,
-                                      nstglobalcomm,
-                                      vsite, constr,
-                                      nstepout, inputrec, mtop,
-                                      fcd, state,
-                                      mdatoms, nrnb, wcycle, ed, fr,
-                                      repl_ex_nst, repl_ex_nex, repl_ex_seed,
-                                      membed,
-                                      cpt_period, max_hours,
-                                      imdport,
-                                      Flags,
-                                      walltime_accounting);
+        my_integrator(inputrec->eI) (fplog, cr, nfile, fnm,
+                                     oenv, bVerbose,
+                                     nstglobalcomm,
+                                     vsite, constr,
+                                     nstepout, inputrec, mtop,
+                                     fcd, state,
+                                     mdatoms, nrnb, wcycle, ed, fr,
+                                     repl_ex_nst, repl_ex_nex, repl_ex_seed,
+                                     cpt_period, max_hours,
+                                     imdport,
+                                     Flags,
+                                     walltime_accounting);
 
-        if (inputrec->bPull)
+        if (inputrec->bRot)
         {
-            finish_pull(inputrec->pull_work);
+            finish_rot(inputrec->rot);
         }
 
-        if (inputrec->bRot)
+        if (inputrec->bPull)
         {
-            finish_rot(inputrec->rot);
+            finish_pull(inputrec->pull_work);
         }
 
     }
@@ -1316,11 +1354,6 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
     /* Free GPU memory and context */
     free_gpu_resources(fr, cr, &hwinfo->gpu_info, fr ? fr->gpu_opt : NULL);
 
-    if (opt2bSet("-membed", nfile, fnm))
-    {
-        sfree(membed);
-    }
-
     gmx_hardware_info_free(hwinfo);
 
     /* Does what it says */
@@ -1337,7 +1370,7 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
 
     done_ed(&ed);
 
-#ifdef GMX_THREAD_MPI
+#if GMX_THREAD_MPI
     /* we need to join all threads. The sub-threads join when they
        exit this function, but the master thread needs to be told to
        wait for that. */
@@ -1349,3 +1382,5 @@ int mdrunner(gmx_hw_opt_t *hw_opt,
 
     return rc;
 }
+
+} // namespace gmx