Merge release-5-0 into master
[alexxy/gromacs.git] / src / programs / mdrun / md.cpp
similarity index 65%
rename from src/programs/mdrun/md.c
rename to src/programs/mdrun/md.cpp
index 367179241f321d34ad7298923ec8e3296b1a9b9c..060595cb18d47b04fc14bbf244de7385e8b4343d 100644 (file)
  * To help us fund GROMACS development, we humbly ask that you cite
  * the research papers on the package. Check out http://www.gromacs.org.
  */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
 
-#include "typedefs.h"
-#include "gromacs/utility/smalloc.h"
-#include "sysstuff.h"
-#include "vec.h"
-#include "vcm.h"
-#include "mdebin.h"
-#include "nrnb.h"
-#include "calcmu.h"
-#include "index.h"
-#include "vsite.h"
-#include "update.h"
-#include "ns.h"
-#include "mdrun.h"
-#include "md_support.h"
-#include "md_logging.h"
-#include "network.h"
-#include "xvgr.h"
-#include "physics.h"
-#include "names.h"
-#include "force.h"
-#include "disre.h"
-#include "orires.h"
-#include "pme.h"
-#include "mdatoms.h"
-#include "repl_ex.h"
-#include "deform.h"
-#include "qmmm.h"
-#include "domdec.h"
-#include "domdec_network.h"
-#include "gromacs/gmxlib/topsort.h"
-#include "coulomb.h"
-#include "constr.h"
-#include "shellfc.h"
-#include "gromacs/gmxpreprocess/compute_io.h"
-#include "checkpoint.h"
-#include "mtop_util.h"
-#include "sighandler.h"
-#include "txtdump.h"
-#include "gromacs/utility/cstringutil.h"
-#include "pme_loadbal.h"
-#include "bondf.h"
-#include "membed.h"
-#include "types/nlistheuristics.h"
-#include "types/iteratedconstraints.h"
-#include "nbnxn_cuda_data_mgmt.h"
+#include "config.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "thread_mpi/threads.h"
 
-#include "gromacs/utility/gmxmpi.h"
-#include "gromacs/fileio/confio.h"
+#include "gromacs/domdec/domdec.h"
+#include "gromacs/domdec/domdec_network.h"
+#include "gromacs/ewald/pme-load-balancing.h"
+#include "gromacs/ewald/pme.h"
+#include "gromacs/fileio/filenm.h"
+#include "gromacs/fileio/mdoutf.h"
 #include "gromacs/fileio/trajectory_writing.h"
-#include "gromacs/fileio/trnio.h"
+#include "gromacs/fileio/trx.h"
 #include "gromacs/fileio/trxio.h"
-#include "gromacs/fileio/xtcio.h"
-#include "gromacs/timing/wallcycle.h"
-#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/legacyheaders/constr.h"
+#include "gromacs/legacyheaders/ebin.h"
+#include "gromacs/legacyheaders/force.h"
+#include "gromacs/legacyheaders/md_logging.h"
+#include "gromacs/legacyheaders/md_support.h"
+#include "gromacs/legacyheaders/mdatoms.h"
+#include "gromacs/legacyheaders/mdebin.h"
+#include "gromacs/legacyheaders/mdrun.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/ns.h"
+#include "gromacs/legacyheaders/shellfc.h"
+#include "gromacs/legacyheaders/sighandler.h"
+#include "gromacs/legacyheaders/sim_util.h"
+#include "gromacs/legacyheaders/tgroup.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/update.h"
+#include "gromacs/legacyheaders/vcm.h"
+#include "gromacs/legacyheaders/vsite.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/legacyheaders/types/constr.h"
+#include "gromacs/legacyheaders/types/enums.h"
+#include "gromacs/legacyheaders/types/fcdata.h"
+#include "gromacs/legacyheaders/types/force_flags.h"
+#include "gromacs/legacyheaders/types/forcerec.h"
+#include "gromacs/legacyheaders/types/group.h"
+#include "gromacs/legacyheaders/types/inputrec.h"
+#include "gromacs/legacyheaders/types/interaction_const.h"
+#include "gromacs/legacyheaders/types/mdatom.h"
+#include "gromacs/legacyheaders/types/membedt.h"
+#include "gromacs/legacyheaders/types/nrnb.h"
+#include "gromacs/legacyheaders/types/oenv.h"
+#include "gromacs/legacyheaders/types/shellfc.h"
+#include "gromacs/legacyheaders/types/state.h"
+#include "gromacs/listed-forces/manage-threading.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdlib/compute_io.h"
+#include "gromacs/mdlib/mdrun_signalling.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pulling/pull.h"
 #include "gromacs/swap/swapcoords.h"
-#include "gromacs/imd/imd.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/topology/atoms.h"
+#include "gromacs/topology/idef.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/topology/topology.h"
+#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/real.h"
+#include "gromacs/utility/smalloc.h"
+
+#include "deform.h"
+#include "membed.h"
+#include "repl_ex.h"
 
 #ifdef GMX_FAHCORE
 #include "corewrap.h"
@@ -105,7 +125,7 @@ static void reset_all_counters(FILE *fplog, t_commrec *cr,
                                gmx_int64_t *step_rel, t_inputrec *ir,
                                gmx_wallcycle_t wcycle, t_nrnb *nrnb,
                                gmx_walltime_accounting_t walltime_accounting,
-                               nbnxn_cuda_ptr_t cu_nbv)
+                               struct nonbonded_verlet_t *nbv)
 {
     char sbuf[STEPSTRSIZE];
 
@@ -113,9 +133,9 @@ static void reset_all_counters(FILE *fplog, t_commrec *cr,
     md_print_warn(cr, fplog, "step %s: resetting all time and cycle counters\n",
                   gmx_step_str(step, sbuf));
 
-    if (cu_nbv)
+    if (use_GPU(nbv))
     {
-        nbnxn_cuda_reset_timings(cu_nbv);
+        nbnxn_gpu_reset_timings(nbv);
     }
 
     wallcycle_stop(wcycle, ewcRUN);
@@ -146,7 +166,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
              gmx_edsam_t ed, t_forcerec *fr,
              int repl_ex_nst, int repl_ex_nex, int repl_ex_seed, gmx_membed_t membed,
              real cpt_period, real max_hours,
-             const char gmx_unused *deviceOptions,
              int imdport,
              unsigned long Flags,
              gmx_walltime_accounting_t walltime_accounting)
@@ -169,9 +188,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     t_trxstatus      *status;
     rvec              mu_tot;
     t_vcm            *vcm;
-    t_state          *bufstate = NULL;
-    matrix           *scale_tot, pcoupl_mu, M, ebox;
-    gmx_nlheur_t      nlh;
+    matrix            pcoupl_mu, M;
     t_trxframe        rerun_fr;
     gmx_repl_ex_t     repl_ex = NULL;
     int               nchkpt  = 1;
@@ -184,43 +201,37 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     gmx_global_stat_t gstat;
     gmx_update_t      upd   = NULL;
     t_graph          *graph = NULL;
-    globsig_t         gs;
+    gmx_signalling_t  gs;
     gmx_groups_t     *groups;
-    gmx_ekindata_t   *ekind, *ekind_save;
+    gmx_ekindata_t   *ekind;
     gmx_shellfc_t     shellfc;
     int               count, nconverged = 0;
-    real              timestep   = 0;
-    double            tcount     = 0;
-    gmx_bool          bConverged = TRUE, bOK, bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
-    gmx_bool          bAppend;
+    double            tcount                 = 0;
+    gmx_bool          bConverged             = TRUE, bSumEkinhOld, bDoReplEx, bExchanged, bNeedRepartition;
     gmx_bool          bResetCountersHalfMaxH = FALSE;
-    gmx_bool          bVV, bIterativeCase, bFirstIterate, bTemp, bPres, bTrotter;
+    gmx_bool          bVV, bTemp, bPres, bTrotter;
     gmx_bool          bUpdateDoLR;
     real              dvdl_constr;
     rvec             *cbuf        = NULL;
     int               cbuf_nalloc = 0;
     matrix            lastbox;
-    real              veta_save, scalevir, tracevir;
-    real              vetanew = 0;
     int               lamnew  = 0;
     /* for FEP */
     int               nstfep;
     double            cycles;
     real              saved_conserved_quantity = 0;
     real              last_ekin                = 0;
-    int               iter_i;
     t_extmass         MassQ;
     int             **trotter_seq;
     char              sbuf[STEPSTRSIZE], sbuf2[STEPSTRSIZE];
     int               handled_stop_condition = gmx_stop_cond_none; /* compare to get_stop_condition*/
-    gmx_iterate_t     iterate;
-    gmx_int64_t       multisim_nsteps = -1;                        /* number of steps to do  before first multisim
+    gmx_int64_t       multisim_nsteps        = -1;                 /* number of steps to do  before first multisim
                                                                           simulation stops. If equal to zero, don't
                                                                           communicate any more between multisims.*/
     /* PME load balancing data for GPU kernels */
-    pme_load_balancing_t pme_loadbal = NULL;
-    double               cycles_pmes;
-    gmx_bool             bPMETuneTry = FALSE, bPMETuneRunning = FALSE;
+    pme_load_balancing_t *pme_loadbal;
+    gmx_bool              bPMETune         = FALSE;
+    gmx_bool              bPMETunePrinting = FALSE;
 
     /* Interactive MD */
     gmx_bool          bIMDstep = FALSE;
@@ -232,7 +243,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
     /* Check for special mdrun options */
     bRerunMD = (Flags & MD_RERUN);
-    bAppend  = (Flags & MD_APPENDFILES);
     if (Flags & MD_RESETCOUNTERSHALFWAY)
     {
         if (ir->nsteps > 0)
@@ -247,13 +257,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     /* md-vv uses averaged full step velocities for T-control
        md-vv-avek uses averaged half step velocities for T-control (but full step ekin for P control)
        md uses averaged half step kinetic energies to determine temperature unless defined otherwise by GMX_EKIN_AVE_VEL; */
-    bVV = EI_VV(ir->eI);
-    /* all the iteratative cases - only if there are constraints */
-    bIterativeCase = ((IR_NPH_TROTTER(ir) || IR_NPT_TROTTER(ir)) && (constr) && (!bRerunMD));
-    gmx_iterate_init(&iterate, FALSE); /* The default value of iterate->bIterationActive is set to
-                                          false in this step.  The correct value, true or false,
-                                          is set at each step, as it depends on the frequency of temperature
-                                          and pressure control.*/
+    bVV      = EI_VV(ir->eI);
     bTrotter = (bVV && (IR_NPT_TROTTER(ir) || IR_NPH_TROTTER(ir) || IR_NVT_TROTTER(ir)));
 
     if (bRerunMD)
@@ -271,19 +275,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     nstglobalcomm   = check_nstglobalcomm(fplog, cr, nstglobalcomm, ir);
     bGStatEveryStep = (nstglobalcomm == 1);
 
-    if (!bGStatEveryStep && ir->nstlist == -1 && fplog != NULL)
-    {
-        fprintf(fplog,
-                "To reduce the energy communication with nstlist = -1\n"
-                "the neighbor list validity should not be checked at every step,\n"
-                "this means that exact integration is not guaranteed.\n"
-                "The neighbor list validity is checked after:\n"
-                "  <n.list life time> - 2*std.dev.(n.list life time)  steps.\n"
-                "In most cases this will result in exact integration.\n"
-                "This reduces the energy communication by a factor of 2 to 3.\n"
-                "If you want less energy communication, set nstlist > 3.\n\n");
-    }
-
     if (bRerunMD)
     {
         ir->nstxout_compressed = 0;
@@ -315,9 +306,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     /* Kinetic energy data */
     snew(ekind, 1);
     init_ekindata(fplog, top_global, &(ir->opts), ekind);
-    /* needed for iteration of constraints */
-    snew(ekind_save, 1);
-    init_ekindata(fplog, top_global, &(ir->opts), ekind_save);
     /* Copy the cos acceleration to the groups struct */
     ekind->cosacc.cos_accel = ir->cos_accel;
 
@@ -350,7 +338,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         gmx_fatal(FARGS, "Normal Mode analysis is not supported with virtual sites.\nIf you'd like to help with adding support, we have an open discussion at http://redmine.gromacs.org/issues/879\n");
     }
 
-    if (bRerunMD && fr->cutoff_scheme == ecutsVERLET && ir->opts.ngener > 1 && fr->nbv && fr->nbv->bUseGPU)
+    if (bRerunMD && fr->cutoff_scheme == ecutsVERLET && ir->opts.ngener > 1 && usingGpu(fr->nbv))
     {
         gmx_fatal(FARGS, "The Verlet scheme on GPUs does not support energy groups, so your rerun should probably use a .tpr file without energy groups, or mdrun -nb auto");
     }
@@ -426,7 +414,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                             state_global, top_global, ir,
                             state, &f, mdatoms, top, fr,
                             vsite, shellfc, constr,
-                            nrnb, wcycle, FALSE);
+                            nrnb, NULL, FALSE);
 
     }
 
@@ -480,26 +468,16 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                         repl_ex_nst, repl_ex_nex, repl_ex_seed);
     }
 
-    /* PME tuning is only supported with GPUs or PME nodes and not with rerun.
-     * PME tuning is not supported with PME only for LJ and not for Coulomb.
+    /* PME tuning is only supported with PME for Coulomb. Is is not supported
+     * with only LJ PME, or for reruns.
      */
-    if ((Flags & MD_TUNEPME) &&
-        EEL_PME(fr->eeltype) &&
-        ( (fr->cutoff_scheme == ecutsVERLET && fr->nbv->bUseGPU) || !(cr->duty & DUTY_PME)) &&
-        !bRerunMD)
+    bPMETune = ((Flags & MD_TUNEPME) && EEL_PME(fr->eeltype) && !bRerunMD &&
+                !(Flags & MD_REPRODUCIBLE));
+    if (bPMETune)
     {
-        pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata);
-        cycles_pmes = 0;
-        if (cr->duty & DUTY_PME)
-        {
-            /* Start tuning right away, as we can't measure the load */
-            bPMETuneRunning = TRUE;
-        }
-        else
-        {
-            /* Separate PME nodes, we can measure the PP/PME load balance */
-            bPMETuneTry = TRUE;
-        }
+        pme_loadbal_init(&pme_loadbal, ir, state->box, fr->ic, fr->pmedata,
+                         use_GPU(fr->nbv), !(cr->duty & DUTY_PME),
+                         &bPMETunePrinting);
     }
 
     if (!ir->bContinuation && !bRerunMD)
@@ -595,12 +573,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                                      and there is no previous step */
     }
 
-    /* if using an iterative algorithm, we need to create a working directory for the state. */
-    if (bIterativeCase)
-    {
-        bufstate = init_bufstate(state);
-    }
-
     /* need to make an initiation call to get the Trotter variables set, as well as other constants for non-trotter
        temperature control */
     trotter_seq = init_npt_vars(ir, state, &MassQ, bTrotter);
@@ -733,9 +705,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     bStateFromTPX    = !bStateFromCP;
     bInitStep        = bFirstStep && (bStateFromTPX || bVV);
     bStartingFromCpt = (Flags & MD_STARTFROMCPT) && bInitStep;
-    bLastStep        = FALSE;
     bSumEkinhOld     = FALSE;
-    bDoReplEx        = FALSE;
     bExchanged       = FALSE;
     bNeedRepartition = FALSE;
 
@@ -744,11 +714,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     step     = ir->init_step;
     step_rel = 0;
 
-    if (ir->nstlist == -1)
-    {
-        init_nlistheuristics(&nlh, bGStatEveryStep, step);
-    }
-
     if (MULTISIM(cr) && (repl_ex_nst <= 0 ))
     {
         /* check how many steps are left in other sims */
@@ -762,6 +727,20 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     while (!bLastStep || (bRerunMD && bNotLastFrame))
     {
 
+        /* Determine if this is a neighbor search step */
+        bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
+
+        if (bPMETune && bNStList)
+        {
+            /* PME grid + cut-off optimization with GPUs or PME nodes */
+            pme_loadbal_do(pme_loadbal, cr,
+                           (bVerbose && MASTER(cr)) ? stderr : NULL,
+                           fplog,
+                           ir, fr, state, wcycle,
+                           step, step_rel,
+                           &bPMETunePrinting);
+        }
+
         wallcycle_start(wcycle, ewcSTEP);
 
         if (bRerunMD)
@@ -875,16 +854,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         }
         else
         {
-            /* Determine whether or not to do Neighbour Searching and LR */
-            bNStList = (ir->nstlist > 0  && step % ir->nstlist == 0);
-
-            bNS = (bFirstStep || bExchanged || bNeedRepartition || bNStList || bDoFEP ||
-                   (ir->nstlist == -1 && nlh.nabnsb > 0));
-
-            if (bNS && ir->nstlist == -1)
-            {
-                set_nlistheuristics(&nlh, bFirstStep || bExchanged || bNeedRepartition || bDoFEP, step);
-            }
+            bNS = (bFirstStep || bExchanged || bNeedRepartition || bNStList || bDoFEP);
         }
 
         /* check whether we should stop because another simulation has
@@ -952,16 +922,13 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             if (DOMAINDECOMP(cr))
             {
                 /* Repartition the domain decomposition */
-                wallcycle_start(wcycle, ewcDOMDEC);
                 dd_partition_system(fplog, step, cr,
                                     bMasterState, nstglobalcomm,
                                     state_global, top_global, ir,
                                     state, &f, mdatoms, top, fr,
                                     vsite, shellfc, constr,
                                     nrnb, wcycle,
-                                    do_verbose && !bPMETuneRunning);
-                wallcycle_stop(wcycle, ewcDOMDEC);
-                /* If using an iterative integrator, reallocate space to match the decomposition */
+                                    do_verbose && !bPMETunePrinting);
             }
         }
 
@@ -1025,8 +992,8 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
 
         /* Do we need global communication ? */
         bGStat = (bCalcVir || bCalcEner || bStopCM ||
-                  do_per_step(step, nstglobalcomm) || (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)) ||
-                  (ir->nstlist == -1 && !bRerunMD && step >= nlh.step_nscheck));
+                  do_per_step(step, nstglobalcomm) ||
+                  (bVV && IR_NVT_TROTTER(ir) && do_per_step(step-1, nstglobalcomm)));
 
         do_ene = (do_per_step(step, ir->nstenergy) || bLastStep);
 
@@ -1133,151 +1100,92 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                           ekind, M, upd, bInitStep, etrtVELOCITY1,
                           cr, nrnb, constr, &top->idef);
 
-            if (bIterativeCase && do_per_step(step-1, ir->nstpcouple) && !bInitStep)
+            if (!bRerunMD || rerun_fr.bV || bForceUpdate)         /* Why is rerun_fr.bV here?  Unclear. */
             {
-                gmx_iterate_init(&iterate, TRUE);
+                wallcycle_stop(wcycle, ewcUPDATE);
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, shake_vir,
+                                   cr, nrnb, wcycle, upd, constr,
+                                   TRUE, bCalcVir);
+                wallcycle_start(wcycle, ewcUPDATE);
+                if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
+                {
+                    /* Correct the virial for multiple time stepping */
+                    m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
+                }
             }
-            /* for iterations, we save these vectors, as we will be self-consistently iterating
-               the calculations */
-
-            /*#### UPDATE EXTENDED VARIABLES IN TROTTER FORMULATION */
-
-            /* save the state */
-            if (iterate.bIterationActive)
+            else if (graph)
             {
-                copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
+                /* Need to unshift here if a do_force has been
+                   called in the previous step */
+                unshift_self(graph, state->box, state->x);
             }
-
-            bFirstIterate = TRUE;
-            while (bFirstIterate || iterate.bIterationActive)
+            /* if VV, compute the pressure and constraints */
+            /* For VV2, we strictly only need this if using pressure
+             * control, but we really would like to have accurate pressures
+             * printed out.
+             * Think about ways around this in the future?
+             * For now, keep this choice in comments.
+             */
+            /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
+            /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
+            bPres = TRUE;
+            bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
+            if (bCalcEner && ir->eI == eiVVAK)
             {
-                if (iterate.bIterationActive)
-                {
-                    copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
-                    if (bFirstIterate && bTrotter)
-                    {
-                        /* The first time through, we need a decent first estimate
-                           of veta(t+dt) to compute the constraints.  Do
-                           this by computing the box volume part of the
-                           trotter integration at this time. Nothing else
-                           should be changed by this routine here.  If
-                           !(first time), we start with the previous value
-                           of veta.  */
-
-                        veta_save = state->veta;
-                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ0);
-                        vetanew     = state->veta;
-                        state->veta = veta_save;
-                    }
-                }
-
-                bOK = TRUE;
-                if (!bRerunMD || rerun_fr.bV || bForceUpdate)     /* Why is rerun_fr.bV here?  Unclear. */
-                {
-                    wallcycle_stop(wcycle, ewcUPDATE);
-                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
-                                       state, fr->bMolPBC, graph, f,
-                                       &top->idef, shake_vir,
-                                       cr, nrnb, wcycle, upd, constr,
-                                       TRUE, bCalcVir, vetanew);
-                    wallcycle_start(wcycle, ewcUPDATE);
-
-                    if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
-                    {
-                        /* Correct the virial for multiple time stepping */
-                        m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
-                    }
-
-                    if (!bOK)
-                    {
-                        gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
-                    }
-
-                }
-                else if (graph)
-                {
-                    /* Need to unshift here if a do_force has been
-                       called in the previous step */
-                    unshift_self(graph, state->box, state->x);
-                }
-
-                /* if VV, compute the pressure and constraints */
-                /* For VV2, we strictly only need this if using pressure
-                 * control, but we really would like to have accurate pressures
-                 * printed out.
-                 * Think about ways around this in the future?
-                 * For now, keep this choice in comments.
-                 */
-                /*bPres = (ir->eI==eiVV || IR_NPT_TROTTER(ir)); */
-                /*bTemp = ((ir->eI==eiVV &&(!bInitStep)) || (ir->eI==eiVVAK && IR_NPT_TROTTER(ir)));*/
-                bPres = TRUE;
-                bTemp = ((ir->eI == eiVV && (!bInitStep)) || (ir->eI == eiVVAK));
-                if (bCalcEner && ir->eI == eiVVAK)  /*MRS:  7/9/2010 -- this still doesn't fix it?*/
-                {
-                    bSumEkinhOld = TRUE;
-                }
-                /* for vv, the first half of the integration actually corresponds to the previous step.
-                   So we need information from the last step in the first half of the integration */
-                if (bGStat || do_per_step(step-1, nstglobalcomm))
+                bSumEkinhOld = TRUE;
+            }
+            /* for vv, the first half of the integration actually corresponds to the previous step.
+               So we need information from the last step in the first half of the integration */
+            if (bGStat || do_per_step(step-1, nstglobalcomm))
+            {
+                wallcycle_stop(wcycle, ewcUPDATE);
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, NULL, FALSE, state->box,
+                                top_global, &bSumEkinhOld,
+                                cglo_flags
+                                | CGLO_ENERGY
+                                | (bTemp ? CGLO_TEMPERATURE : 0)
+                                | (bPres ? CGLO_PRESSURE : 0)
+                                | (bPres ? CGLO_CONSTRAINT : 0)
+                                | CGLO_SCALEEKIN
+                                );
+                /* explanation of above:
+                   a) We compute Ekin at the full time step
+                   if 1) we are using the AveVel Ekin, and it's not the
+                   initial step, or 2) if we are using AveEkin, but need the full
+                   time step kinetic energy for the pressure (always true now, since we want accurate statistics).
+                   b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
+                   EkinAveVel because it's needed for the pressure */
+                wallcycle_start(wcycle, ewcUPDATE);
+            }
+            /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
+            if (!bInitStep)
+            {
+                if (bTrotter)
                 {
-                    wallcycle_stop(wcycle, ewcUPDATE);
-                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
-                                    constr, NULL, FALSE, state->box,
-                                    top_global, &bSumEkinhOld,
-                                    cglo_flags
-                                    | CGLO_ENERGY
-                                    | (bTemp ? CGLO_TEMPERATURE : 0)
-                                    | (bPres ? CGLO_PRESSURE : 0)
-                                    | (bPres ? CGLO_CONSTRAINT : 0)
-                                    | ((iterate.bIterationActive) ? CGLO_ITERATE : 0)
-                                    | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
-                                    | CGLO_SCALEEKIN
-                                    );
-                    /* explanation of above:
-                       a) We compute Ekin at the full time step
-                       if 1) we are using the AveVel Ekin, and it's not the
-                       initial step, or 2) if we are using AveEkin, but need the full
-                       time step kinetic energy for the pressure (always true now, since we want accurate statistics).
-                       b) If we are using EkinAveEkin for the kinetic energy for the temperature control, we still feed in
-                       EkinAveVel because it's needed for the pressure */
-                    wallcycle_start(wcycle, ewcUPDATE);
+                    m_add(force_vir, shake_vir, total_vir);     /* we need the un-dispersion corrected total vir here */
+                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
                 }
-                /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
-                if (!bInitStep)
+                else
                 {
-                    if (bTrotter)
-                    {
-                        m_add(force_vir, shake_vir, total_vir); /* we need the un-dispersion corrected total vir here */
-                        trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ2);
-                    }
-                    else
+                    if (bExchanged)
                     {
-                        if (bExchanged)
-                        {
-                            wallcycle_stop(wcycle, ewcUPDATE);
-                            /* We need the kinetic energy at minus the half step for determining
-                             * the full step kinetic energy and possibly for T-coupling.*/
-                            /* This may not be quite working correctly yet . . . . */
-                            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                            wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
-                                            constr, NULL, FALSE, state->box,
-                                            top_global, &bSumEkinhOld,
-                                            CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
-                            wallcycle_start(wcycle, ewcUPDATE);
-                        }
+                        wallcycle_stop(wcycle, ewcUPDATE);
+                        /* We need the kinetic energy at minus the half step for determining
+                         * the full step kinetic energy and possibly for T-coupling.*/
+                        /* This may not be quite working correctly yet . . . . */
+                        compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                        wcycle, enerd, NULL, NULL, NULL, NULL, mu_tot,
+                                        constr, NULL, FALSE, state->box,
+                                        top_global, &bSumEkinhOld,
+                                        CGLO_RERUNMD | CGLO_GSTAT | CGLO_TEMPERATURE);
+                        wallcycle_start(wcycle, ewcUPDATE);
                     }
                 }
-
-                if (iterate.bIterationActive &&
-                    done_iterating(cr, fplog, step, &iterate, bFirstIterate,
-                                   state->veta, &vetanew))
-                {
-                    break;
-                }
-                bFirstIterate = FALSE;
             }
-
             if (bTrotter && !bInitStep)
             {
                 copy_mat(shake_vir, state->svir_prev);
@@ -1298,7 +1206,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             wallcycle_stop(wcycle, ewcUPDATE);
         }
 
-        /* MRS -- now done iterating -- compute the conserved quantity */
+        /* compute the conserved quantity */
         if (bVV)
         {
             saved_conserved_quantity = compute_conserved_from_auxiliary(ir, state, &MassQ);
@@ -1311,7 +1219,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 saved_conserved_quantity -= enerd->term[F_DISPCORR];
             }
             /* sum up the foreign energy and dhdl terms for vv.  currently done every step so that dhdl is correct in the .edr */
-            if (!bRerunMD)
+            if (ir->efep != efepNO && !bRerunMD)
             {
                 sum_dhdl(enerd, state->lambda, ir->fepvals);
             }
@@ -1406,31 +1314,6 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             gs.sig[eglsRESETCOUNTERS] = 1;
         }
 
-        if (ir->nstlist == -1 && !bRerunMD)
-        {
-            /* When bGStatEveryStep=FALSE, global_stat is only called
-             * when we check the atom displacements, not at NS steps.
-             * This means that also the bonded interaction count check is not
-             * performed immediately after NS. Therefore a few MD steps could
-             * be performed with missing interactions.
-             * But wrong energies are never written to file,
-             * since energies are only written after global_stat
-             * has been called.
-             */
-            if (step >= nlh.step_nscheck)
-            {
-                nlh.nabnsb = natoms_beyond_ns_buffer(ir, fr, &top->cgs,
-                                                     nlh.scale_tot, state->x);
-            }
-            else
-            {
-                /* This is not necessarily true,
-                 * but step_nscheck is determined quite conservatively.
-                 */
-                nlh.nabnsb = 0;
-            }
-        }
-
         /* In parallel we only have to check for checkpointing in steps
          * where we do global communication,
          *  otherwise the other nodes don't know.
@@ -1458,105 +1341,103 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                 /* if we have constraints, we have to remove the kinetic energy parallel to the bonds */
                 if (constr && bIfRandomize)
                 {
-                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
+                    update_constraints(fplog, step, NULL, ir, mdatoms,
                                        state, fr->bMolPBC, graph, f,
                                        &top->idef, tmp_vir,
                                        cr, nrnb, wcycle, upd, constr,
-                                       TRUE, bCalcVir, vetanew);
+                                       TRUE, bCalcVir);
                 }
             }
         }
+        /* #########   START SECOND UPDATE STEP ################# */
+        /* Box is changed in update() when we do pressure coupling,
+         * but we should still use the old box for energy corrections and when
+         * writing it to the energy file, so it matches the trajectory files for
+         * the same timestep above. Make a copy in a separate array.
+         */
+        copy_mat(state->box, lastbox);
 
-        if (bIterativeCase && do_per_step(step, ir->nstpcouple))
-        {
-            gmx_iterate_init(&iterate, TRUE);
-            /* for iterations, we save these vectors, as we will be redoing the calculations */
-            copy_coupling_state(state, bufstate, ekind, ekind_save, &(ir->opts));
-        }
+        dvdl_constr = 0;
 
-        bFirstIterate = TRUE;
-        while (bFirstIterate || iterate.bIterationActive)
+        if (!bRerunMD || rerun_fr.bV || bForceUpdate)
         {
-            /* We now restore these vectors to redo the calculation with improved extended variables */
-            if (iterate.bIterationActive)
+            wallcycle_start(wcycle, ewcUPDATE);
+            /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
+            if (bTrotter)
+            {
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
+                /* We can only do Berendsen coupling after we have summed
+                 * the kinetic energy or virial. Since the happens
+                 * in global_state after update, we should only do it at
+                 * step % nstlist = 1 with bGStatEveryStep=FALSE.
+                 */
+            }
+            else
             {
-                copy_coupling_state(bufstate, state, ekind_save, ekind, &(ir->opts));
+                update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
+                update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep);
             }
 
-            /* We make the decision to break or not -after- the calculation of Ekin and Pressure,
-               so scroll down for that logic */
+            if (bVV)
+            {
+                bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
-            /* #########   START SECOND UPDATE STEP ################# */
-            /* Box is changed in update() when we do pressure coupling,
-             * but we should still use the old box for energy corrections and when
-             * writing it to the energy file, so it matches the trajectory files for
-             * the same timestep above. Make a copy in a separate array.
-             */
-            copy_mat(state->box, lastbox);
+                /* velocity half-step update */
+                update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                              bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
+                              ekind, M, upd, FALSE, etrtVELOCITY2,
+                              cr, nrnb, constr, &top->idef);
+            }
 
-            bOK         = TRUE;
-            dvdl_constr = 0;
+            /* Above, initialize just copies ekinh into ekin,
+             * it doesn't copy position (for VV),
+             * and entire integrator for MD.
+             */
 
-            if (!(bRerunMD && !rerun_fr.bV && !bForceUpdate))
+            if (ir->eI == eiVVAK)
             {
-                wallcycle_start(wcycle, ewcUPDATE);
-                /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
-                if (bTrotter)
-                {
-                    if (iterate.bIterationActive)
-                    {
-                        if (bFirstIterate)
-                        {
-                            scalevir = 1;
-                        }
-                        else
-                        {
-                            /* we use a new value of scalevir to converge the iterations faster */
-                            scalevir = tracevir/trace(shake_vir);
-                        }
-                        msmul(shake_vir, scalevir, shake_vir);
-                        m_add(force_vir, shake_vir, total_vir);
-                        clear_mat(shake_vir);
-                    }
-                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ3);
-                    /* We can only do Berendsen coupling after we have summed
-                     * the kinetic energy or virial. Since the happens
-                     * in global_state after update, we should only do it at
-                     * step % nstlist = 1 with bGStatEveryStep=FALSE.
-                     */
-                }
-                else
+                /* We probably only need md->homenr, not state->natoms */
+                if (state->natoms > cbuf_nalloc)
                 {
-                    update_tcouple(step, ir, state, ekind, &MassQ, mdatoms);
-                    update_pcouple(fplog, step, ir, state, pcoupl_mu, M, bInitStep);
+                    cbuf_nalloc = state->natoms;
+                    srenew(cbuf, cbuf_nalloc);
                 }
+                copy_rvecn(state->x, cbuf, 0, state->natoms);
+            }
+            bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
-                if (bVV)
-                {
-                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
+            update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
+                          bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
+                          ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
+            wallcycle_stop(wcycle, ewcUPDATE);
 
-                    /* velocity half-step update */
-                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                                  bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
-                                  ekind, M, upd, FALSE, etrtVELOCITY2,
-                                  cr, nrnb, constr, &top->idef);
-                }
+            update_constraints(fplog, step, &dvdl_constr, ir, mdatoms, state,
+                               fr->bMolPBC, graph, f,
+                               &top->idef, shake_vir,
+                               cr, nrnb, wcycle, upd, constr,
+                               FALSE, bCalcVir);
 
-                /* Above, initialize just copies ekinh into ekin,
-                 * it doesn't copy position (for VV),
-                 * and entire integrator for MD.
-                 */
+            if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
+            {
+                /* Correct the virial for multiple time stepping */
+                m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
+            }
+
+            if (ir->eI == eiVVAK)
+            {
+                /* erase F_EKIN and F_TEMP here? */
+                /* just compute the kinetic energy at the half step to perform a trotter step */
+                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                                constr, NULL, FALSE, lastbox,
+                                top_global, &bSumEkinhOld,
+                                cglo_flags | CGLO_TEMPERATURE
+                                );
+                wallcycle_start(wcycle, ewcUPDATE);
+                trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
+                /* now we know the scaling, we can compute the positions again again */
+                copy_rvecn(cbuf, state->x, 0, state->natoms);
 
-                if (ir->eI == eiVVAK)
-                {
-                    /* We probably only need md->homenr, not state->natoms */
-                    if (state->natoms > cbuf_nalloc)
-                    {
-                        cbuf_nalloc = state->natoms;
-                        srenew(cbuf, cbuf_nalloc);
-                    }
-                    copy_rvecn(state->x, cbuf, 0, state->natoms);
-                }
                 bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
 
                 update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
@@ -1564,166 +1445,102 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                               ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
                 wallcycle_stop(wcycle, ewcUPDATE);
 
-                update_constraints(fplog, step, &dvdl_constr, ir, ekind, mdatoms, state,
-                                   fr->bMolPBC, graph, f,
-                                   &top->idef, shake_vir,
-                                   cr, nrnb, wcycle, upd, constr,
-                                   FALSE, bCalcVir, state->veta);
-
-                if (bCalcVir && bUpdateDoLR && ir->nstcalclr > 1)
-                {
-                    /* Correct the virial for multiple time stepping */
-                    m_sub(shake_vir, fr->vir_twin_constr, shake_vir);
-                }
-
-                if (ir->eI == eiVVAK)
-                {
-                    /* erase F_EKIN and F_TEMP here? */
-                    /* just compute the kinetic energy at the half step to perform a trotter step */
-                    compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                    wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
-                                    constr, NULL, FALSE, lastbox,
-                                    top_global, &bSumEkinhOld,
-                                    cglo_flags | CGLO_TEMPERATURE
-                                    );
-                    wallcycle_start(wcycle, ewcUPDATE);
-                    trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, &MassQ, trotter_seq, ettTSEQ4);
-                    /* now we know the scaling, we can compute the positions again again */
-                    copy_rvecn(cbuf, state->x, 0, state->natoms);
-
-                    bUpdateDoLR = (fr->bTwinRange && do_per_step(step, ir->nstcalclr));
-
-                    update_coords(fplog, step, ir, mdatoms, state, fr->bMolPBC, f,
-                                  bUpdateDoLR, fr->f_twin, bCalcVir ? &fr->vir_twin_constr : NULL, fcd,
-                                  ekind, M, upd, bInitStep, etrtPOSITION, cr, nrnb, constr, &top->idef);
-                    wallcycle_stop(wcycle, ewcUPDATE);
-
-                    /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
-                    /* are the small terms in the shake_vir here due
-                     * to numerical errors, or are they important
-                     * physically? I'm thinking they are just errors, but not completely sure.
-                     * For now, will call without actually constraining, constr=NULL*/
-                    update_constraints(fplog, step, NULL, ir, ekind, mdatoms,
-                                       state, fr->bMolPBC, graph, f,
-                                       &top->idef, tmp_vir,
-                                       cr, nrnb, wcycle, upd, NULL,
-                                       FALSE, bCalcVir,
-                                       state->veta);
-                }
-                if (!bOK)
-                {
-                    gmx_fatal(FARGS, "Constraint error: Shake, Lincs or Settle could not solve the constrains");
-                }
-
-                if (fr->bSepDVDL && fplog && do_log)
-                {
-                    gmx_print_sepdvdl(fplog, "Constraint dV/dl", 0.0, dvdl_constr);
-                }
-                if (bVV)
-                {
-                    /* this factor or 2 correction is necessary
-                       because half of the constraint force is removed
-                       in the vv step, so we have to double it.  See
-                       the Redmine issue #1255.  It is not yet clear
-                       if the factor of 2 is exact, or just a very
-                       good approximation, and this will be
-                       investigated.  The next step is to see if this
-                       can be done adding a dhdl contribution from the
-                       rattle step, but this is somewhat more
-                       complicated with the current code. Will be
-                       investigated, hopefully for 4.6.3. However,
-                       this current solution is much better than
-                       having it completely wrong.
-                     */
-                    enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
-                }
-                else
-                {
-                    enerd->term[F_DVDL_CONSTR] += dvdl_constr;
-                }
+                /* do we need an extra constraint here? just need to copy out of state->v to upd->xp? */
+                /* are the small terms in the shake_vir here due
+                 * to numerical errors, or are they important
+                 * physically? I'm thinking they are just errors, but not completely sure.
+                 * For now, will call without actually constraining, constr=NULL*/
+                update_constraints(fplog, step, NULL, ir, mdatoms,
+                                   state, fr->bMolPBC, graph, f,
+                                   &top->idef, tmp_vir,
+                                   cr, nrnb, wcycle, upd, NULL,
+                                   FALSE, bCalcVir);
             }
-            else if (graph)
+            if (bVV)
             {
-                /* Need to unshift here */
-                unshift_self(graph, state->box, state->x);
+                /* this factor or 2 correction is necessary
+                   because half of the constraint force is removed
+                   in the vv step, so we have to double it.  See
+                   the Redmine issue #1255.  It is not yet clear
+                   if the factor of 2 is exact, or just a very
+                   good approximation, and this will be
+                   investigated.  The next step is to see if this
+                   can be done adding a dhdl contribution from the
+                   rattle step, but this is somewhat more
+                   complicated with the current code. Will be
+                   investigated, hopefully for 4.6.3. However,
+                   this current solution is much better than
+                   having it completely wrong.
+                 */
+                enerd->term[F_DVDL_CONSTR] += 2*dvdl_constr;
             }
-
-            if (vsite != NULL)
+            else
             {
-                wallcycle_start(wcycle, ewcVSITECONSTR);
-                if (graph != NULL)
-                {
-                    shift_self(graph, state->box, state->x);
-                }
-                construct_vsites(vsite, state->x, ir->delta_t, state->v,
-                                 top->idef.iparams, top->idef.il,
-                                 fr->ePBC, fr->bMolPBC, cr, state->box);
-
-                if (graph != NULL)
-                {
-                    unshift_self(graph, state->box, state->x);
-                }
-                wallcycle_stop(wcycle, ewcVSITECONSTR);
+                enerd->term[F_DVDL_CONSTR] += dvdl_constr;
             }
+        }
+        else if (graph)
+        {
+            /* Need to unshift here */
+            unshift_self(graph, state->box, state->x);
+        }
 
-            /* ############## IF NOT VV, Calculate globals HERE, also iterate constraints  ############ */
-            /* With Leap-Frog we can skip compute_globals at
-             * non-communication steps, but we need to calculate
-             * the kinetic energy one step before communication.
-             */
-            if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+        if (vsite != NULL)
+        {
+            wallcycle_start(wcycle, ewcVSITECONSTR);
+            if (graph != NULL)
             {
-                if (ir->nstlist == -1 && bFirstIterate)
-                {
-                    gs.sig[eglsNABNSB] = nlh.nabnsb;
-                }
-                compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
-                                wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
-                                constr,
-                                bFirstIterate ? &gs : NULL,
-                                (step_rel % gs.nstms == 0) &&
-                                (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
-                                lastbox,
-                                top_global, &bSumEkinhOld,
-                                cglo_flags
-                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
-                                | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
-                                | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
-                                | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
-                                | (iterate.bIterationActive ? CGLO_ITERATE : 0)
-                                | (bFirstIterate ? CGLO_FIRSTITERATE : 0)
-                                | CGLO_CONSTRAINT
-                                );
-                if (ir->nstlist == -1 && bFirstIterate)
-                {
-                    nlh.nabnsb         = gs.set[eglsNABNSB];
-                    gs.set[eglsNABNSB] = 0;
-                }
+                shift_self(graph, state->box, state->x);
             }
-            /* bIterate is set to keep it from eliminating the old ekin kinetic energy terms */
-            /* #############  END CALC EKIN AND PRESSURE ################# */
-
-            /* Note: this is OK, but there are some numerical precision issues with using the convergence of
-               the virial that should probably be addressed eventually. state->veta has better properies,
-               but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
-               generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+            construct_vsites(vsite, state->x, ir->delta_t, state->v,
+                             top->idef.iparams, top->idef.il,
+                             fr->ePBC, fr->bMolPBC, cr, state->box);
 
-            if (iterate.bIterationActive &&
-                done_iterating(cr, fplog, step, &iterate, bFirstIterate,
-                               trace(shake_vir), &tracevir))
+            if (graph != NULL)
             {
-                break;
+                unshift_self(graph, state->box, state->x);
             }
-            bFirstIterate = FALSE;
+            wallcycle_stop(wcycle, ewcVSITECONSTR);
+        }
+
+        /* ############## IF NOT VV, Calculate globals HERE  ############ */
+        /* With Leap-Frog we can skip compute_globals at
+         * non-communication steps, but we need to calculate
+         * the kinetic energy one step before communication.
+         */
+        if (bGStat || (!EI_VV(ir->eI) && do_per_step(step+1, nstglobalcomm)))
+        {
+            compute_globals(fplog, gstat, cr, ir, fr, ekind, state, state_global, mdatoms, nrnb, vcm,
+                            wcycle, enerd, force_vir, shake_vir, total_vir, pres, mu_tot,
+                            constr, &gs,
+                            (step_rel % gs.nstms == 0) &&
+                            (multisim_nsteps < 0 || (step_rel < multisim_nsteps)),
+                            lastbox,
+                            top_global, &bSumEkinhOld,
+                            cglo_flags
+                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_ENERGY : 0)
+                            | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+                            | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+                            | (!EI_VV(ir->eI) || bRerunMD ? CGLO_PRESSURE : 0)
+                            | CGLO_CONSTRAINT
+                            );
         }
 
-        if (!bVV || bRerunMD)
+        /* #############  END CALC EKIN AND PRESSURE ################# */
+
+        /* Note: this is OK, but there are some numerical precision issues with using the convergence of
+           the virial that should probably be addressed eventually. state->veta has better properies,
+           but what we actually need entering the new cycle is the new shake_vir value. Ideally, we could
+           generate the new shake_vir, but test the veta value for convergence.  This will take some thought. */
+
+        if (ir->efep != efepNO && (!bVV || bRerunMD))
         {
-            /* sum up the foreign energy and dhdl terms for md and sd. currently done every step so that dhdl is correct in the .edr */
+            /* Sum up the foreign energy and dhdl terms for md and sd.
+               Currently done every step so that dhdl is correct in the .edr */
             sum_dhdl(enerd, state->lambda, ir->fepvals);
         }
         update_box(fplog, step, ir, mdatoms, state, f,
-                   ir->nstlist == -1 ? &nlh.scale_tot : NULL, pcoupl_mu, nrnb, upd);
+                   pcoupl_mu, nrnb, upd);
 
         /* ################# END UPDATE STEP 2 ################# */
         /* #### We now have r(t+dt) and v(t+dt/2)  ############# */
@@ -1789,9 +1606,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
                            step, t,
                            eprNORMAL, bCompact, mdebin, fcd, groups, &(ir->opts));
             }
-            if (ir->ePull != epullNO)
+            if (ir->bPull)
             {
-                pull_print_output(ir->pull, step, t);
+                pull_print_output(ir->pull_work, step, t);
             }
 
             if (do_per_step(step, ir->nstlog))
@@ -1809,7 +1626,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             state->fep_state = lamnew;
         }
         /* Print the remaining wall clock time for the run */
-        if (MULTIMASTER(cr) && (do_verbose || gmx_got_usr_signal()) && !bPMETuneRunning)
+        if (MULTIMASTER(cr) &&
+            (do_verbose || gmx_got_usr_signal()) &&
+            !bPMETunePrinting)
         {
             if (shellfc)
             {
@@ -1893,97 +1712,17 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
             }
         }
 
-        if (!bRerunMD || !rerun_fr.bStep)
-        {
-            /* increase the MD step number */
-            step++;
-            step_rel++;
-        }
-
         cycles = wallcycle_stop(wcycle, ewcSTEP);
         if (DOMAINDECOMP(cr) && wcycle)
         {
             dd_cycles_add(cr->dd, cycles, ddCyclStep);
         }
 
-        if (bPMETuneRunning || bPMETuneTry)
+        if (!bRerunMD || !rerun_fr.bStep)
         {
-            /* PME grid + cut-off optimization with GPUs or PME nodes */
-
-            /* Count the total cycles over the last steps */
-            cycles_pmes += cycles;
-
-            /* We can only switch cut-off at NS steps */
-            if (step % ir->nstlist == 0)
-            {
-                /* PME grid + cut-off optimization with GPUs or PME nodes */
-                if (bPMETuneTry)
-                {
-                    if (DDMASTER(cr->dd))
-                    {
-                        /* PME node load is too high, start tuning */
-                        bPMETuneRunning = (dd_pme_f_ratio(cr->dd) >= 1.05);
-                    }
-                    dd_bcast(cr->dd, sizeof(gmx_bool), &bPMETuneRunning);
-
-                    if (bPMETuneRunning &&
-                        fr->nbv->bUseGPU && DOMAINDECOMP(cr) &&
-                        !(cr->duty & DUTY_PME))
-                    {
-                        /* Lock DLB=auto to off (does nothing when DLB=yes/no).
-                         * With GPUs + separate PME ranks, we don't want DLB.
-                         * This could happen when we scan coarse grids and
-                         * it would then never be turned off again.
-                         * This would hurt performance at the final, optimal
-                         * grid spacing, where DLB almost never helps.
-                         * Also, DLB can limit the cut-off for PME tuning.
-                         */
-                        dd_dlb_set_lock(cr->dd, TRUE);
-                    }
-
-                    if (bPMETuneRunning || step_rel > ir->nstlist*50)
-                    {
-                        bPMETuneTry     = FALSE;
-                    }
-                }
-                if (bPMETuneRunning)
-                {
-                    /* init_step might not be a multiple of nstlist,
-                     * but the first cycle is always skipped anyhow.
-                     */
-                    bPMETuneRunning =
-                        pme_load_balance(pme_loadbal, cr,
-                                         (bVerbose && MASTER(cr)) ? stderr : NULL,
-                                         fplog,
-                                         ir, state, cycles_pmes,
-                                         fr->ic, fr->nbv, &fr->pmedata,
-                                         step);
-
-                    /* Update constants in forcerec/inputrec to keep them in sync with fr->ic */
-                    fr->ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
-                    fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
-                    fr->rlist         = fr->ic->rlist;
-                    fr->rlistlong     = fr->ic->rlistlong;
-                    fr->rcoulomb      = fr->ic->rcoulomb;
-                    fr->rvdw          = fr->ic->rvdw;
-
-                    if (ir->eDispCorr != edispcNO)
-                    {
-                        calc_enervirdiff(NULL, ir->eDispCorr, fr);
-                    }
-
-                    if (!bPMETuneRunning &&
-                        DOMAINDECOMP(cr) &&
-                        dd_dlb_is_locked(cr->dd))
-                    {
-                        /* Unlock the DLB=auto, DLB is allowed to activate
-                         * (but we don't expect it to activate in most cases).
-                         */
-                        dd_dlb_set_lock(cr->dd, FALSE);
-                    }
-                }
-                cycles_pmes = 0;
-            }
+            /* increase the MD step number */
+            step++;
+            step_rel++;
         }
 
         if (step_rel == wcycle_get_reset_counters(wcycle) ||
@@ -1991,7 +1730,7 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
         {
             /* Reset all the counters related to performance over the run */
             reset_all_counters(fplog, cr, step, &step_rel, ir, wcycle, nrnb, walltime_accounting,
-                               fr->nbv != NULL && fr->nbv->bUseGPU ? fr->nbv->cu_nbv : NULL);
+                               use_GPU(fr->nbv) ? fr->nbv : NULL);
             wcycle_set_reset_counters(wcycle, -1);
             if (!(cr->duty & DUTY_PME))
             {
@@ -2041,16 +1780,9 @@ double do_md(FILE *fplog, t_commrec *cr, int nfile, const t_filenm fnm[],
     done_mdoutf(outf);
     debug_gmx();
 
-    if (ir->nstlist == -1 && nlh.nns > 0 && fplog)
-    {
-        fprintf(fplog, "Average neighborlist lifetime: %.1f steps, std.dev.: %.1f steps\n", nlh.s1/nlh.nns, sqrt(nlh.s2/nlh.nns - sqr(nlh.s1/nlh.nns)));
-        fprintf(fplog, "Average number of atoms that crossed the half buffer length: %.1f\n\n", nlh.ab/nlh.nns);
-    }
-
-    if (pme_loadbal != NULL)
+    if (bPMETune)
     {
-        pme_loadbal_done(pme_loadbal, cr, fplog,
-                         fr->nbv != NULL && fr->nbv->bUseGPU);
+        pme_loadbal_done(pme_loadbal, cr, fplog, use_GPU(fr->nbv));
     }
 
     if (shellfc && fplog)