Manually sort some includes in src/gromacs

[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.c
diff --git a/src/gromacs/mdlib/sim_util.c b/src/gromacs/mdlib/sim_util.c

index 538d1caf61f60ad4f1e046eaf2f83caf42fe42aa..4090760008135c7cce14eacd163c55c11f2108a5 100644 (file)
--- a/src/gromacs/mdlib/sim_util.c
+++ b/src/gromacs/mdlib/sim_util.c
@@ -1,136 +1,113 @@
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
- *
- *
- *                This source code is part of
- *
- *                 G   R   O   M   A   C   S
- *
- *          GROningen MAchine for Chemical Simulations
+/*
+ * This file is part of the GROMACS molecular simulation package.
   *
- *                        VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
   * of the License, or (at your option) any later version.
   *
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
   *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
   *
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
   *
- * And Hey:
- * GROwing Monsters And Cloning Shrimps
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
   */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#ifdef GMX_CRAY_XT3
-#include <catamount/dclock.h>
-#endif
+#include "gmxpre.h"
  
+#include "gromacs/legacyheaders/sim_util.h"
  
+#include <assert.h>
+#include <math.h>
  #include <stdio.h>
-#include <time.h>
+#include <string.h>
+
+#include "config.h"
+
  #ifdef HAVE_SYS_TIME_H
  #include <sys/time.h>
  #endif
-#include <math.h>
-#include "typedefs.h"
-#include "string2.h"
-#include "gmxfio.h"
-#include "smalloc.h"
-#include "names.h"
-#include "confio.h"
-#include "mvdata.h"
-#include "txtdump.h"
-#include "pbc.h"
-#include "chargegroup.h"
-#include "vec.h"
-#include <time.h>
-#include "nrnb.h"
-#include "mshift.h"
-#include "mdrun.h"
-#include "sim_util.h"
-#include "update.h"
-#include "physics.h"
-#include "main.h"
-#include "mdatoms.h"
-#include "force.h"
-#include "bondf.h"
-#include "pme.h"
-#include "disre.h"
-#include "orires.h"
-#include "network.h"
-#include "calcmu.h"
-#include "constr.h"
-#include "xvgr.h"
-#include "trnio.h"
-#include "xtcio.h"
-#include "copyrite.h"
-#include "pull_rotation.h"
-#include "gmx_random.h"
-#include "domdec.h"
-#include "partdec.h"
-#include "gmx_wallcycle.h"
-#include "genborn.h"
-#include "nbnxn_atomdata.h"
-#include "nbnxn_search.h"
-#include "nbnxn_kernels/nbnxn_kernel_ref.h"
-#include "nbnxn_kernels/nbnxn_kernel_simd_4xn.h"
-#include "nbnxn_kernels/nbnxn_kernel_simd_2xnn.h"
-#include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
  
+#include "gromacs/bonded/bonded.h"
+#include "gromacs/essentialdynamics/edsam.h"
+#include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
+#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
+#include "gromacs/imd/imd.h"
+#include "gromacs/legacyheaders/calcmu.h"
+#include "gromacs/legacyheaders/chargegroup.h"
+#include "gromacs/legacyheaders/constr.h"
+#include "gromacs/legacyheaders/copyrite.h"
+#include "gromacs/legacyheaders/disre.h"
+#include "gromacs/legacyheaders/domdec.h"
+#include "gromacs/legacyheaders/force.h"
+#include "gromacs/legacyheaders/genborn.h"
+#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+#include "gromacs/legacyheaders/mdatoms.h"
+#include "gromacs/legacyheaders/mdrun.h"
+#include "gromacs/legacyheaders/names.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nonbonded.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/orires.h"
+#include "gromacs/legacyheaders/pme.h"
+#include "gromacs/legacyheaders/qmmm.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/update.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/adress.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_atomdata.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+#include "gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_ref.h"
+#include "gromacs/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
+#include "gromacs/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/pbcutil/mshift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/timing/walltime_accounting.h"
+#include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/smalloc.h"
  
-#include "adress.h"
-#include "qmmm.h"
-
-#include "nbnxn_cuda_data_mgmt.h"
-#include "nbnxn_cuda/nbnxn_cuda.h"
-
-double
-gmx_gettime()
-{
-#ifdef HAVE_GETTIMEOFDAY
-    struct timeval t;
-    double         seconds;
-
-    gettimeofday(&t, NULL);
-
-    seconds = (double) t.tv_sec + 1e-6*(double)t.tv_usec;
-
-    return seconds;
-#else
-    double  seconds;
-
-    seconds = time(NULL);
-
-    return seconds;
-#endif
-}
-
-
-#define difftime(end, start) ((double)(end)-(double)(start))
-
-void print_time(FILE *out, gmx_runtime_t *runtime, gmx_large_int_t step,
-                t_inputrec *ir, t_commrec gmx_unused *cr)
+void print_time(FILE                     *out,
+                gmx_walltime_accounting_t walltime_accounting,
+                gmx_int64_t               step,
+                t_inputrec               *ir,
+                t_commrec gmx_unused     *cr)
  {
      time_t finish;
      char   timebuf[STRLEN];
-    double dt;
+    double dt, elapsed_seconds, time_per_step;
      char   buf[48];
  
  #ifndef GMX_THREAD_MPI
@@ -142,17 +119,16 @@ void print_time(FILE *out, gmx_runtime_t *runtime, gmx_large_int_t step,
      fprintf(out, "step %s", gmx_step_str(step, buf));
      if ((step >= ir->nstlist))
      {
-        runtime->last          = gmx_gettime();
-        dt                     = difftime(runtime->last, runtime->real);
-        runtime->time_per_step = dt/(step - ir->init_step + 1);
-
-        dt = (ir->nsteps + ir->init_step - step)*runtime->time_per_step;
+        double seconds_since_epoch = gmx_gettime();
+        elapsed_seconds = seconds_since_epoch - walltime_accounting_get_start_time_stamp(walltime_accounting);
+        time_per_step   = elapsed_seconds/(step - ir->init_step + 1);
+        dt              = (ir->nsteps + ir->init_step - step) * time_per_step;
  
          if (ir->nsteps >= 0)
          {
              if (dt >= 300)
              {
-                finish = (time_t) (runtime->last + dt);
+                finish = (time_t) (seconds_since_epoch + dt);
                  gmx_ctime_r(&finish, timebuf, STRLEN);
                  sprintf(buf, "%s", timebuf);
                  buf[strlen(buf)-1] = '\0';
@@ -160,13 +136,13 @@ void print_time(FILE *out, gmx_runtime_t *runtime, gmx_large_int_t step,
              }
              else
              {
-                fprintf(out, ", remaining runtime: %5d s          ", (int)dt);
+                fprintf(out, ", remaining wall clock time: %5d s          ", (int)dt);
              }
          }
          else
          {
              fprintf(out, " performance: %.1f ns/day    ",
-                    ir->delta_t/1000*24*60*60/runtime->time_per_step);
+                    ir->delta_t/1000*24*60*60/time_per_step);
          }
      }
  #ifndef GMX_THREAD_MPI
@@ -179,92 +155,41 @@ void print_time(FILE *out, gmx_runtime_t *runtime, gmx_large_int_t step,
      fflush(out);
  }
  
-#ifdef NO_CLOCK
-#define clock() -1
-#endif
-
-static double set_proctime(gmx_runtime_t *runtime)
+void print_date_and_time(FILE *fplog, int nodeid, const char *title,
+                         double the_time)
  {
-    double diff;
-#ifdef GMX_CRAY_XT3
-    double prev;
-
-    prev          = runtime->proc;
-    runtime->proc = dclock();
-
-    diff = runtime->proc - prev;
-#else
-    clock_t prev;
-
-    prev          = runtime->proc;
-    runtime->proc = clock();
+    char   time_string[STRLEN];
  
-    diff = (double)(runtime->proc - prev)/(double)CLOCKS_PER_SEC;
-#endif
-    if (diff < 0)
+    if (!fplog)
      {
-        /* The counter has probably looped, ignore this data */
-        diff = 0;
+        return;
      }
  
-    return diff;
-}
-
-void runtime_start(gmx_runtime_t *runtime)
-{
-    runtime->real          = gmx_gettime();
-    runtime->proc          = 0;
-    set_proctime(runtime);
-    runtime->realtime      = 0;
-    runtime->proctime      = 0;
-    runtime->last          = 0;
-    runtime->time_per_step = 0;
-}
-
-void runtime_end(gmx_runtime_t *runtime)
-{
-    double now;
-
-    now = gmx_gettime();
-
-    runtime->proctime += set_proctime(runtime);
-    runtime->realtime  = now - runtime->real;
-    runtime->real      = now;
-}
-
-void runtime_upd_proc(gmx_runtime_t *runtime)
-{
-    runtime->proctime += set_proctime(runtime);
-}
-
-void print_date_and_time(FILE *fplog, int nodeid, const char *title,
-                         const gmx_runtime_t *runtime)
-{
-    int    i;
-    char   timebuf[STRLEN];
-    char   time_string[STRLEN];
-    time_t tmptime;
-
-    if (fplog)
      {
-        if (runtime != NULL)
-        {
-            tmptime = (time_t) runtime->real;
-            gmx_ctime_r(&tmptime, timebuf, STRLEN);
-        }
-        else
-        {
-            tmptime = (time_t) gmx_gettime();
-            gmx_ctime_r(&tmptime, timebuf, STRLEN);
-        }
+        int    i;
+        char   timebuf[STRLEN];
+        time_t temp_time = (time_t) the_time;
+
+        gmx_ctime_r(&temp_time, timebuf, STRLEN);
          for (i = 0; timebuf[i] >= ' '; i++)
          {
              time_string[i] = timebuf[i];
          }
          time_string[i] = '\0';
-
-        fprintf(fplog, "%s on node %d %s\n", title, nodeid, time_string);
      }
+
+    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
+}
+
+void print_start(FILE *fplog, t_commrec *cr,
+                 gmx_walltime_accounting_t walltime_accounting,
+                 const char *name)
+{
+    char buf[STRLEN];
+
+    sprintf(buf, "Started %s", name);
+    print_date_and_time(fplog, cr->nodeid, buf,
+                        walltime_accounting_get_start_time_stamp(walltime_accounting));
  }
  
  static void sum_forces(int start, int end, rvec f[], rvec flr[])
@@ -388,9 +313,7 @@ static void calc_virial(int start, int homenr, rvec x[], rvec f[],
      }
  }
  
-static void posres_wrapper(FILE *fplog,
-                           int flags,
-                           gmx_bool bSepDVDL,
+static void posres_wrapper(int flags,
                             t_inputrec *ir,
                             t_nrnb *nrnb,
                             gmx_localtop_t *top,
@@ -412,10 +335,6 @@ static void posres_wrapper(FILE *fplog,
                    ir->ePBC == epbcNONE ? NULL : &pbc,
                    lambda[efptRESTRAINT], &dvdl,
                    fr->rc_scaling, fr->ePBC, fr->posres_com, fr->posres_comB);
-    if (bSepDVDL)
-    {
-        gmx_print_sepdvdl(fplog, interaction_function[F_POSRES].longname, v, dvdl);
-    }
      enerd->term[F_POSRES] += v;
      /* If just the force constant changes, the FEP term is linear,
       * but if k changes, it is not.
@@ -440,9 +359,28 @@ static void posres_wrapper(FILE *fplog,
      }
  }
  
-static void pull_potential_wrapper(FILE *fplog,
-                                   gmx_bool bSepDVDL,
-                                   t_commrec *cr,
+static void fbposres_wrapper(t_inputrec *ir,
+                             t_nrnb *nrnb,
+                             gmx_localtop_t *top,
+                             matrix box, rvec x[],
+                             gmx_enerdata_t *enerd,
+                             t_forcerec *fr)
+{
+    t_pbc pbc;
+    real  v;
+
+    /* Flat-bottomed position restraints always require full pbc */
+    set_pbc(&pbc, ir->ePBC, box);
+    v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
+                 top->idef.iparams_fbposres,
+                 (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
+                 ir->ePBC == epbcNONE ? NULL : &pbc,
+                 fr->rc_scaling, fr->ePBC, fr->posres_com);
+    enerd->term[F_FBPOSRES] += v;
+    inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
+}
+
+static void pull_potential_wrapper(t_commrec *cr,
                                     t_inputrec *ir,
                                     matrix box, rvec x[],
                                     rvec f[],
@@ -450,7 +388,8 @@ static void pull_potential_wrapper(FILE *fplog,
                                     t_mdatoms *mdatoms,
                                     gmx_enerdata_t *enerd,
                                     real *lambda,
-                                   double t)
+                                   double t,
+                                   gmx_wallcycle_t wcycle)
  {
      t_pbc  pbc;
      real   dvdl;
@@ -460,26 +399,22 @@ static void pull_potential_wrapper(FILE *fplog,
       * The virial contribution is calculated directly,
       * which is why we call pull_potential after calc_virial.
       */
+    wallcycle_start(wcycle, ewcPULLPOT);
      set_pbc(&pbc, ir->ePBC, box);
      dvdl                     = 0;
      enerd->term[F_COM_PULL] +=
          pull_potential(ir->ePull, ir->pull, mdatoms, &pbc,
                         cr, t, lambda[efptRESTRAINT], x, f, vir_force, &dvdl);
-    if (bSepDVDL)
-    {
-        gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl);
-    }
      enerd->dvdl_lin[efptRESTRAINT] += dvdl;
+    wallcycle_stop(wcycle, ewcPULLPOT);
  }
  
-static void pme_receive_force_ener(FILE           *fplog,
-                                   gmx_bool        bSepDVDL,
-                                   t_commrec      *cr,
+static void pme_receive_force_ener(t_commrec      *cr,
                                     gmx_wallcycle_t wcycle,
                                     gmx_enerdata_t *enerd,
                                     t_forcerec     *fr)
  {
-    real   e, v, dvdl;
+    real   e_q, e_lj, v, dvdl_q, dvdl_lj;
      float  cycles_ppdpme, cycles_seppme;
  
      cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
@@ -489,15 +424,16 @@ static void pme_receive_force_ener(FILE           *fplog,
       * forces, virial and energy from the PME nodes here.
       */
      wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
-    dvdl = 0;
-    gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e, &dvdl,
+    dvdl_q  = 0;
+    dvdl_lj = 0;
+    gmx_pme_receive_f(cr, fr->f_novirsum, fr->vir_el_recip, &e_q,
+                      fr->vir_lj_recip, &e_lj, &dvdl_q, &dvdl_lj,
                        &cycles_seppme);
-    if (bSepDVDL)
-    {
-        gmx_print_sepdvdl(fplog, "PME mesh", e, dvdl);
-    }
-    enerd->term[F_COUL_RECIP] += e;
-    enerd->dvdl_lin[efptCOUL] += dvdl;
+    enerd->term[F_COUL_RECIP] += e_q;
+    enerd->term[F_LJ_RECIP]   += e_lj;
+    enerd->dvdl_lin[efptCOUL] += dvdl_q;
+    enerd->dvdl_lin[efptVDW]  += dvdl_lj;
+
      if (wcycle)
      {
          dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
@@ -506,14 +442,14 @@ static void pme_receive_force_ener(FILE           *fplog,
  }
  
  static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
-                               gmx_large_int_t step, real pforce, rvec *x, rvec *f)
+                               gmx_int64_t step, real pforce, rvec *x, rvec *f)
  {
      int  i;
      real pf2, fn2;
      char buf[STEPSTRSIZE];
  
      pf2 = sqr(pforce);
-    for (i = md->start; i < md->start+md->homenr; i++)
+    for (i = 0; i < md->homenr; i++)
      {
          fn2 = norm2(f[i]);
          /* We also catch NAN, if the compiler does not optimize this away. */
@@ -527,7 +463,7 @@ static void print_large_forces(FILE *fp, t_mdatoms *md, t_commrec *cr,
  }
  
  static void post_process_forces(t_commrec *cr,
-                                gmx_large_int_t step,
+                                gmx_int64_t step,
                                  t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                                  gmx_localtop_t *top,
                                  matrix box, rvec x[],
@@ -562,7 +498,7 @@ static void post_process_forces(t_commrec *cr,
              }
              else
              {
-                sum_forces(mdatoms->start, mdatoms->start+mdatoms->homenr,
+                sum_forces(0, mdatoms->homenr,
                             f, fr->f_novirsum);
              }
              if (EEL_FULL(fr->eeltype))
@@ -570,6 +506,11 @@ static void post_process_forces(t_commrec *cr,
                  /* Add the mesh contribution to the virial */
                  m_add(vir_force, fr->vir_el_recip, vir_force);
              }
+            if (EVDW_PME(fr->vdwtype))
+            {
+                /* Add the mesh contribution to the virial */
+                m_add(vir_force, fr->vir_lj_recip, vir_force);
+            }
              if (debug)
              {
                  pr_rvecs(debug, 0, "vir_force", vir_force, DIM);
@@ -588,12 +529,13 @@ static void do_nb_verlet(t_forcerec *fr,
                           gmx_enerdata_t *enerd,
                           int flags, int ilocality,
                           int clearF,
-                         t_nrnb *nrnb)
+                         t_nrnb *nrnb,
+                         gmx_wallcycle_t wcycle)
  {
      int                        nnbl, kernel_type, enr_nbnxn_kernel_ljc, enr_nbnxn_kernel_lj;
      char                      *env;
      nonbonded_verlet_group_t  *nbvg;
-    gmx_bool                  bCUDA;
+    gmx_bool                   bCUDA;
  
      if (!(flags & GMX_FORCE_NONBONDED))
      {
@@ -709,13 +651,146 @@ static void do_nb_verlet(t_forcerec *fr,
               nbvg->nbl_lists.natpair_ljq);
      inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
               nbvg->nbl_lists.natpair_lj);
+    /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
      inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
               nbvg->nbl_lists.natpair_q);
+
+    if (ic->vdw_modifier == eintmodFORCESWITCH)
+    {
+        /* We add up the switch cost separately */
+        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
+                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
+    }
+    if (ic->vdw_modifier == eintmodPOTSWITCH)
+    {
+        /* We add up the switch cost separately */
+        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
+                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
+    }
+    if (ic->vdwtype == evdwPME)
+    {
+        /* We add up the LJ Ewald cost separately */
+        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
+                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
+    }
+}
+
+static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
+                             t_forcerec           *fr,
+                             rvec                  x[],
+                             rvec                  f[],
+                             t_mdatoms            *mdatoms,
+                             t_lambda             *fepvals,
+                             real                 *lambda,
+                             gmx_enerdata_t       *enerd,
+                             int                   flags,
+                             t_nrnb               *nrnb,
+                             gmx_wallcycle_t       wcycle)
+{
+    int              donb_flags;
+    nb_kernel_data_t kernel_data;
+    real             lam_i[efptNR];
+    real             dvdl_nb[efptNR];
+    int              th;
+    int              i, j;
+
+    donb_flags = 0;
+    /* Add short-range interactions */
+    donb_flags |= GMX_NONBONDED_DO_SR;
+
+    /* Currently all group scheme kernels always calculate (shift-)forces */
+    if (flags & GMX_FORCE_FORCES)
+    {
+        donb_flags |= GMX_NONBONDED_DO_FORCE;
+    }
+    if (flags & GMX_FORCE_VIRIAL)
+    {
+        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
+    }
+    if (flags & GMX_FORCE_ENERGY)
+    {
+        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
+    }
+    if (flags & GMX_FORCE_DO_LR)
+    {
+        donb_flags |= GMX_NONBONDED_DO_LR;
+    }
+
+    kernel_data.flags  = donb_flags;
+    kernel_data.lambda = lambda;
+    kernel_data.dvdl   = dvdl_nb;
+
+    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
+    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
+
+    /* reset free energy components */
+    for (i = 0; i < efptNR; i++)
+    {
+        dvdl_nb[i]  = 0;
+    }
+
+    assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);
+
+    wallcycle_sub_start(wcycle, ewcsNONBONDED);
+#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
+    for (th = 0; th < nbl_lists->nnbl; th++)
+    {
+        gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
+                                  x, f, fr, mdatoms, &kernel_data, nrnb);
+    }
+
+    if (fepvals->sc_alpha != 0)
+    {
+        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
+        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+    else
+    {
+        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
+        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+
+    /* If we do foreign lambda and we have soft-core interactions
+     * we have to recalculate the (non-linear) energies contributions.
+     */
+    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
+    {
+        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
+        kernel_data.lambda         = lam_i;
+        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
+        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
+        /* Note that we add to kernel_data.dvdl, but ignore the result */
+
+        for (i = 0; i < enerd->n_lambda; i++)
+        {
+            for (j = 0; j < efptNR; j++)
+            {
+                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
+            }
+            reset_foreign_enerdata(enerd);
+#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
+            for (th = 0; th < nbl_lists->nnbl; th++)
+            {
+                gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
+                                          x, f, fr, mdatoms, &kernel_data, nrnb);
+            }
+
+            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
+            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
+        }
+    }
+
+    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+}
+
+gmx_bool use_GPU(const nonbonded_verlet_t *nbv)
+{
+    return nbv != NULL && nbv->bUseGPU;
  }
  
  void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                           t_inputrec *inputrec,
-                         gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                         gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                           gmx_localtop_t *top,
                           gmx_groups_t gmx_unused *groups,
                           matrix box, rvec x[], history_t *hist,
@@ -734,24 +809,23 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
      int                 start, homenr;
      int                 nb_kernel_type;
      double              mu[2*DIM];
-    gmx_bool            bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+    gmx_bool            bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
      gmx_bool            bDoLongRange, bDoForces, bSepLRF, bUseGPU, bUseOrEmulGPU;
      gmx_bool            bDiffKernels = FALSE;
      matrix              boxs;
      rvec                vzero, box_diag;
      real                e, v, dvdl;
-    float               cycles_pme, cycles_force;
+    float               cycles_pme, cycles_force, cycles_wait_gpu;
      nonbonded_verlet_t *nbv;
  
-    cycles_force   = 0;
-    nbv            = fr->nbv;
-    nb_kernel_type = fr->nbv->grp[0].kernel_type;
+    cycles_force    = 0;
+    cycles_wait_gpu = 0;
+    nbv             = fr->nbv;
+    nb_kernel_type  = fr->nbv->grp[0].kernel_type;
  
-    start  = mdatoms->start;
+    start  = 0;
      homenr = mdatoms->homenr;
  
-    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
-
      clear_mat(vir_force);
  
      cg0 = 0;
@@ -826,6 +900,8 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
           * we do not need to worry about shifting.
           */
  
+        int pme_flags = 0;
+
          wallcycle_start(wcycle, ewcPP_PMESENDX);
  
          bBS = (inputrec->nwall == 2);
@@ -835,9 +911,20 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
              svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
          }
  
-        gmx_pme_send_x(cr, bBS ? boxs : box, x,
-                       mdatoms->nChargePerturbed, lambda[efptCOUL],
-                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
+        if (EEL_PME(fr->eeltype))
+        {
+            pme_flags |= GMX_PME_DO_COULOMB;
+        }
+
+        if (EVDW_PME(fr->vdwtype))
+        {
+            pme_flags |= GMX_PME_DO_LJ;
+        }
+
+        gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
+                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed, lambda[efptCOUL], lambda[efptVDW],
+                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
+                                 pme_flags, step);
  
          wallcycle_stop(wcycle, ewcPP_PMESENDX);
      }
@@ -949,7 +1036,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
          /* launch local nonbonded F on GPU */
          do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFNo,
-                     nrnb);
+                     nrnb, wcycle);
          wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
      }
  
@@ -1030,7 +1117,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
              wallcycle_start(wcycle, ewcLAUNCH_GPU_NB);
              /* launch non-local nonbonded F on GPU */
              do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFNo,
-                         nrnb);
+                         nrnb, wcycle);
              cycles_force += wallcycle_stop(wcycle, ewcLAUNCH_GPU_NB);
          }
      }
@@ -1082,13 +1169,10 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
      reset_enerdata(fr, bNS, enerd, MASTER(cr));
      clear_rvecs(SHIFTS, fr->fshift);
  
-    if (DOMAINDECOMP(cr))
+    if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
      {
-        if (!(cr->duty & DUTY_PME))
-        {
-            wallcycle_start(wcycle, ewcPPDURINGPME);
-            dd_force_flop_start(cr->dd, nrnb);
-        }
+        wallcycle_start(wcycle, ewcPPDURINGPME);
+        dd_force_flop_start(cr->dd, nrnb);
      }
  
      if (inputrec->bRot)
@@ -1161,7 +1245,30 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
      {
          /* Maybe we should move this into do_force_lowlevel */
          do_nb_verlet(fr, ic, enerd, flags, eintLocal, enbvClearFYes,
-                     nrnb);
+                     nrnb, wcycle);
+    }
+
+    if (fr->efep != efepNO)
+    {
+        /* Calculate the local and non-local free energy interactions here.
+         * Happens here on the CPU both with and without GPU.
+         */
+        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
+        {
+            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
+                             fr, x, f, mdatoms,
+                             inputrec->fepvals, lambda,
+                             enerd, flags, nrnb, wcycle);
+        }
+
+        if (DOMAINDECOMP(cr) &&
+            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
+        {
+            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
+                             fr, x, f, mdatoms,
+                             inputrec->fepvals, lambda,
+                             enerd, flags, nrnb, wcycle);
+        }
      }
  
      if (!bUseOrEmulGPU || bDiffKernels)
@@ -1172,7 +1279,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          {
              do_nb_verlet(fr, ic, enerd, flags, eintNonlocal,
                           bDiffKernels ? enbvClearFYes : enbvClearFNo,
-                         nrnb);
+                         nrnb, wcycle);
          }
  
          if (!bUseOrEmulGPU)
@@ -1213,15 +1320,20 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
  
      if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
      {
-        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+        posres_wrapper(flags, inputrec, nrnb, top, box, x,
                         enerd, lambda, fr);
      }
  
+    if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
+    {
+        fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
+    }
+
      /* Compute the bonded and non-bonded energies and optionally forces */
-    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+    do_force_lowlevel(fr, inputrec, &(top->idef),
                        cr, nrnb, wcycle, mdatoms,
                        x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
-                      &(top->atomtypes), bBornRadii, box,
+                      bBornRadii, box,
                        inputrec->fepvals, lambda, graph, &(top->excls), fr->mu_tot,
                        flags, &cycles_pme);
  
@@ -1251,19 +1363,23 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          {
              if (bUseGPU)
              {
+                float cycles_tmp;
+
                  wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
                  nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                      nbv->grp[eintNonlocal].nbat,
                                      flags, eatNonlocal,
                                      enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                      fr->fshift);
-                cycles_force += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+                cycles_tmp       = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+                cycles_wait_gpu += cycles_tmp;
+                cycles_force    += cycles_tmp;
              }
              else
              {
                  wallcycle_start_nocount(wcycle, ewcFORCE);
                  do_nb_verlet(fr, ic, enerd, flags, eintNonlocal, enbvClearFYes,
-                             nrnb);
+                             nrnb, wcycle);
                  cycles_force += wallcycle_stop(wcycle, ewcFORCE);
              }
              wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
@@ -1279,37 +1395,31 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          }
      }
  
-    if (bDoForces)
+    if (bDoForces && DOMAINDECOMP(cr))
      {
          /* Communicate the forces */
-        if (PAR(cr))
+        wallcycle_start(wcycle, ewcMOVEF);
+        dd_move_f(cr->dd, f, fr->fshift);
+        /* Do we need to communicate the separate force array
+         * for terms that do not contribute to the single sum virial?
+         * Position restraints and electric fields do not introduce
+         * inter-cg forces, only full electrostatics methods do.
+         * When we do not calculate the virial, fr->f_novirsum = f,
+         * so we have already communicated these forces.
+         */
+        if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+            (flags & GMX_FORCE_VIRIAL))
          {
-            wallcycle_start(wcycle, ewcMOVEF);
-            if (DOMAINDECOMP(cr))
-            {
-                dd_move_f(cr->dd, f, fr->fshift);
-                /* Do we need to communicate the separate force array
-                 * for terms that do not contribute to the single sum virial?
-                 * Position restraints and electric fields do not introduce
-                 * inter-cg forces, only full electrostatics methods do.
-                 * When we do not calculate the virial, fr->f_novirsum = f,
-                 * so we have already communicated these forces.
-                 */
-                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
-                    (flags & GMX_FORCE_VIRIAL))
-                {
-                    dd_move_f(cr->dd, fr->f_novirsum, NULL);
-                }
-                if (bSepLRF)
-                {
-                    /* We should not update the shift forces here,
-                     * since f_twin is already included in f.
-                     */
-                    dd_move_f(cr->dd, fr->f_twin, NULL);
-                }
-            }
-            wallcycle_stop(wcycle, ewcMOVEF);
+            dd_move_f(cr->dd, fr->f_novirsum, NULL);
+        }
+        if (bSepLRF)
+        {
+            /* We should not update the shift forces here,
+             * since f_twin is already included in f.
+             */
+            dd_move_f(cr->dd, fr->f_twin, NULL);
          }
+        wallcycle_stop(wcycle, ewcMOVEF);
      }
  
      if (bUseOrEmulGPU)
@@ -1323,7 +1433,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                                  flags, eatLocal,
                                  enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                  fr->fshift);
-            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
  
              /* now clear the GPU outputs while we finish the step on the CPU */
  
@@ -1336,7 +1446,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
              wallcycle_start_nocount(wcycle, ewcFORCE);
              do_nb_verlet(fr, ic, enerd, flags, eintLocal,
                           DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
-                         nrnb);
+                         nrnb, wcycle);
              wallcycle_stop(wcycle, ewcFORCE);
          }
          wallcycle_start(wcycle, ewcNB_XF_BUF_OPS);
@@ -1357,6 +1467,10 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          if (wcycle)
          {
              dd_cycles_add(cr->dd, cycles_force-cycles_pme, ddCyclF);
+            if (bUseGPU)
+            {
+                dd_cycles_add(cr->dd, cycles_wait_gpu, ddCyclWaitGPU);
+            }
          }
      }
  
@@ -1393,15 +1507,19 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          if (flags & GMX_FORCE_VIRIAL)
          {
              /* Calculation of the virial must be done after vsites! */
-            calc_virial(mdatoms->start, mdatoms->homenr, x, f,
+            calc_virial(0, mdatoms->homenr, x, f,
                          vir_force, graph, box, nrnb, fr, inputrec->ePBC);
          }
      }
  
      if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
      {
-        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
-                               f, vir_force, mdatoms, enerd, lambda, t);
+        /* Since the COM pulling is always done mass-weighted, no forces are
+         * applied to vsites and this call can be done after vsite spreading.
+         */
+        pull_potential_wrapper(cr, inputrec, box, x,
+                               f, vir_force, mdatoms, enerd, lambda, t,
+                               wcycle);
      }
  
      /* Add the forces from enforced rotation potentials (if any) */
@@ -1412,12 +1530,15 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          wallcycle_stop(wcycle, ewcROTadd);
      }
  
+    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
+    IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
+
      if (PAR(cr) && !(cr->duty & DUTY_PME))
      {
          /* In case of node-splitting, the PP nodes receive the long-range
           * forces, virial and energy from the PME nodes here.
           */
-        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+        pme_receive_force_ener(cr, wcycle, enerd, fr);
      }
  
      if (bDoForces)
@@ -1433,7 +1554,7 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
  
  void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
                          t_inputrec *inputrec,
-                        gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+                        gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                          gmx_localtop_t *top,
                          gmx_groups_t *groups,
                          matrix box, rvec x[], history_t *hist,
@@ -1450,7 +1571,7 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
      int        cg0, cg1, i, j;
      int        start, homenr;
      double     mu[2*DIM];
-    gmx_bool   bSepDVDL, bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
+    gmx_bool   bStateChanged, bNS, bFillGrid, bCalcCGCM, bBS;
      gmx_bool   bDoLongRangeNS, bDoForces, bDoPotential, bSepLRF;
      gmx_bool   bDoAdressWF;
      matrix     boxs;
@@ -1459,32 +1580,23 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
      t_pbc      pbc;
      float      cycles_pme, cycles_force;
  
-    start  = mdatoms->start;
+    start  = 0;
      homenr = mdatoms->homenr;
  
-    bSepDVDL = (fr->bSepDVDL && do_per_step(step, inputrec->nstlog));
-
      clear_mat(vir_force);
  
-    if (PARTDECOMP(cr))
+    cg0 = 0;
+    if (DOMAINDECOMP(cr))
      {
-        pd_cg_range(cr, &cg0, &cg1);
+        cg1 = cr->dd->ncg_tot;
      }
      else
      {
-        cg0 = 0;
-        if (DOMAINDECOMP(cr))
-        {
-            cg1 = cr->dd->ncg_tot;
-        }
-        else
-        {
-            cg1 = top->cgs.nr;
-        }
-        if (fr->n_tpi > 0)
-        {
-            cg1--;
-        }
+        cg1 = top->cgs.nr;
+    }
+    if (fr->n_tpi > 0)
+    {
+        cg1--;
      }
  
      bStateChanged  = (flags & GMX_FORCE_STATECHANGED);
@@ -1545,16 +1657,9 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
          inc_nrnb(nrnb, eNR_CGCM, homenr);
      }
  
-    if (bCalcCGCM)
+    if (bCalcCGCM && gmx_debug_at)
      {
-        if (PAR(cr))
-        {
-            move_cgcm(fplog, cr, fr->cg_cm);
-        }
-        if (gmx_debug_at)
-        {
-            pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
-        }
+        pr_rvecs(debug, 0, "cgcm", fr->cg_cm, top->cgs.nr);
      }
  
  #ifdef GMX_MPI
@@ -1566,6 +1671,8 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
           * we do not need to worry about shifting.
           */
  
+        int pme_flags = 0;
+
          wallcycle_start(wcycle, ewcPP_PMESENDX);
  
          bBS = (inputrec->nwall == 2);
@@ -1575,26 +1682,30 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
              svmul(inputrec->wall_ewald_zfac, boxs[ZZ], boxs[ZZ]);
          }
  
-        gmx_pme_send_x(cr, bBS ? boxs : box, x,
-                       mdatoms->nChargePerturbed, lambda[efptCOUL],
-                       (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)), step);
+        if (EEL_PME(fr->eeltype))
+        {
+            pme_flags |= GMX_PME_DO_COULOMB;
+        }
+
+        if (EVDW_PME(fr->vdwtype))
+        {
+            pme_flags |= GMX_PME_DO_LJ;
+        }
+
+        gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
+                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed, lambda[efptCOUL], lambda[efptVDW],
+                                 (flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)),
+                                 pme_flags, step);
  
          wallcycle_stop(wcycle, ewcPP_PMESENDX);
      }
  #endif /* GMX_MPI */
  
      /* Communicate coordinates and sum dipole if necessary */
-    if (PAR(cr))
+    if (DOMAINDECOMP(cr))
      {
          wallcycle_start(wcycle, ewcMOVEX);
-        if (DOMAINDECOMP(cr))
-        {
-            dd_move_x(cr->dd, box, x);
-        }
-        else
-        {
-            move_x(cr, x, nrnb);
-        }
+        dd_move_x(cr->dd, box, x);
          wallcycle_stop(wcycle, ewcMOVEX);
      }
  
@@ -1685,13 +1796,10 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
                         x, box, fr, &top->idef, graph, fr->born);
      }
  
-    if (DOMAINDECOMP(cr))
+    if (DOMAINDECOMP(cr) && !(cr->duty & DUTY_PME))
      {
-        if (!(cr->duty & DUTY_PME))
-        {
-            wallcycle_start(wcycle, ewcPPDURINGPME);
-            dd_force_flop_start(cr->dd, nrnb);
-        }
+        wallcycle_start(wcycle, ewcPPDURINGPME);
+        dd_force_flop_start(cr->dd, nrnb);
      }
  
      if (inputrec->bRot)
@@ -1761,31 +1869,20 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
  
      if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_POSRES].nr > 0)
      {
-        posres_wrapper(fplog, flags, bSepDVDL, inputrec, nrnb, top, box, x,
+        posres_wrapper(flags, inputrec, nrnb, top, box, x,
                         enerd, lambda, fr);
      }
  
      if ((flags & GMX_FORCE_BONDED) && top->idef.il[F_FBPOSRES].nr > 0)
      {
-        /* Flat-bottomed position restraints always require full pbc */
-        if (!(bStateChanged && bDoAdressWF))
-        {
-            set_pbc(&pbc, inputrec->ePBC, box);
-        }
-        v = fbposres(top->idef.il[F_FBPOSRES].nr, top->idef.il[F_FBPOSRES].iatoms,
-                     top->idef.iparams_fbposres,
-                     (const rvec*)x, fr->f_novirsum, fr->vir_diag_posres,
-                     inputrec->ePBC == epbcNONE ? NULL : &pbc,
-                     fr->rc_scaling, fr->ePBC, fr->posres_com);
-        enerd->term[F_FBPOSRES] += v;
-        inc_nrnb(nrnb, eNR_FBPOSRES, top->idef.il[F_FBPOSRES].nr/2);
+        fbposres_wrapper(inputrec, nrnb, top, box, x, enerd, fr);
      }
  
      /* Compute the bonded and non-bonded energies and optionally forces */
-    do_force_lowlevel(fplog, step, fr, inputrec, &(top->idef),
+    do_force_lowlevel(fr, inputrec, &(top->idef),
                        cr, nrnb, wcycle, mdatoms,
                        x, hist, f, bSepLRF ? fr->f_twin : f, enerd, fcd, top, fr->born,
-                      &(top->atomtypes), bBornRadii, box,
+                      bBornRadii, box,
                        inputrec->fepvals, lambda,
                        graph, &(top->excls), fr->mu_tot,
                        flags,
@@ -1837,39 +1934,28 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
          }
  
          /* Communicate the forces */
-        if (PAR(cr))
+        if (DOMAINDECOMP(cr))
          {
              wallcycle_start(wcycle, ewcMOVEF);
-            if (DOMAINDECOMP(cr))
+            dd_move_f(cr->dd, f, fr->fshift);
+            /* Do we need to communicate the separate force array
+             * for terms that do not contribute to the single sum virial?
+             * Position restraints and electric fields do not introduce
+             * inter-cg forces, only full electrostatics methods do.
+             * When we do not calculate the virial, fr->f_novirsum = f,
+             * so we have already communicated these forces.
+             */
+            if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
+                (flags & GMX_FORCE_VIRIAL))
              {
-                dd_move_f(cr->dd, f, fr->fshift);
-                /* Do we need to communicate the separate force array
-                 * for terms that do not contribute to the single sum virial?
-                 * Position restraints and electric fields do not introduce
-                 * inter-cg forces, only full electrostatics methods do.
-                 * When we do not calculate the virial, fr->f_novirsum = f,
-                 * so we have already communicated these forces.
-                 */
-                if (EEL_FULL(fr->eeltype) && cr->dd->n_intercg_excl &&
-                    (flags & GMX_FORCE_VIRIAL))
-                {
-                    dd_move_f(cr->dd, fr->f_novirsum, NULL);
-                }
-                if (bSepLRF)
-                {
-                    /* We should not update the shift forces here,
-                     * since f_twin is already included in f.
-                     */
-                    dd_move_f(cr->dd, fr->f_twin, NULL);
-                }
+                dd_move_f(cr->dd, fr->f_novirsum, NULL);
              }
-            else
+            if (bSepLRF)
              {
-                pd_move_f(cr, f, nrnb);
-                if (bSepLRF)
-                {
-                    pd_move_f(cr, fr->f_twin, nrnb);
-                }
+                /* We should not update the shift forces here,
+                 * since f_twin is already included in f.
+                 */
+                dd_move_f(cr->dd, fr->f_twin, NULL);
              }
              wallcycle_stop(wcycle, ewcMOVEF);
          }
@@ -1897,15 +1983,16 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
          if (flags & GMX_FORCE_VIRIAL)
          {
              /* Calculation of the virial must be done after vsites! */
-            calc_virial(mdatoms->start, mdatoms->homenr, x, f,
+            calc_virial(0, mdatoms->homenr, x, f,
                          vir_force, graph, box, nrnb, fr, inputrec->ePBC);
          }
      }
  
      if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
      {
-        pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
-                               f, vir_force, mdatoms, enerd, lambda, t);
+        pull_potential_wrapper(cr, inputrec, box, x,
+                               f, vir_force, mdatoms, enerd, lambda, t,
+                               wcycle);
      }
  
      /* Add the forces from enforced rotation potentials (if any) */
@@ -1916,12 +2003,15 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
          wallcycle_stop(wcycle, ewcROTadd);
      }
  
+    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
+    IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
+
      if (PAR(cr) && !(cr->duty & DUTY_PME))
      {
          /* In case of node-splitting, the PP nodes receive the long-range
           * forces, virial and energy from the PME nodes here.
           */
-        pme_receive_force_ener(fplog, bSepDVDL, cr, wcycle, enerd, fr);
+        pme_receive_force_ener(cr, wcycle, enerd, fr);
      }
  
      if (bDoForces)
@@ -1937,7 +2027,7 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
  
  void do_force(FILE *fplog, t_commrec *cr,
                t_inputrec *inputrec,
-              gmx_large_int_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
+              gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
                gmx_localtop_t *top,
                gmx_groups_t *groups,
                matrix box, rvec x[], history_t *hist,
@@ -2003,15 +2093,15 @@ void do_constrain_first(FILE *fplog, gmx_constr_t constr,
                          t_forcerec *fr, gmx_localtop_t *top)
  {
      int             i, m, start, end;
-    gmx_large_int_t step;
+    gmx_int64_t     step;
      real            dt = ir->delta_t;
      real            dvdl_dum;
      rvec           *savex;
  
      snew(savex, state->natoms);
  
-    start = md->start;
-    end   = md->homenr + start;
+    start = 0;
+    end   = md->homenr;
  
      if (debug)
      {
@@ -2030,7 +2120,7 @@ void do_constrain_first(FILE *fplog, gmx_constr_t constr,
  
      /* constrain the current position */
      constrain(NULL, TRUE, FALSE, constr, &(top->idef),
-              ir, NULL, cr, step, 0, md,
+              ir, NULL, cr, step, 0, 1.0, md,
                state->x, state->x, NULL,
                fr->bMolPBC, state->box,
                state->lambda[efptBONDED], &dvdl_dum,
@@ -2042,7 +2132,7 @@ void do_constrain_first(FILE *fplog, gmx_constr_t constr,
          /* also may be useful if we need the ekin from the halfstep for velocity verlet */
          /* might not yet treat veta correctly */
          constrain(NULL, TRUE, FALSE, constr, &(top->idef),
-                  ir, NULL, cr, step, 0, md,
+                  ir, NULL, cr, step, 0, 1.0, md,
                    state->x, state->v, state->v,
                    fr->bMolPBC, state->box,
                    state->lambda[efptBONDED], &dvdl_dum,
@@ -2073,7 +2163,7 @@ void do_constrain_first(FILE *fplog, gmx_constr_t constr,
          }
          dvdl_dum = 0;
          constrain(NULL, TRUE, FALSE, constr, &(top->idef),
-                  ir, NULL, cr, step, -1, md,
+                  ir, NULL, cr, step, -1, 1.0, md,
                    state->x, savex, NULL,
                    fr->bMolPBC, state->box,
                    state->lambda[efptBONDED], &dvdl_dum,
@@ -2092,13 +2182,81 @@ void do_constrain_first(FILE *fplog, gmx_constr_t constr,
      sfree(savex);
  }
  
-void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
+
+static void
+integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
+                double *enerout, double *virout)
  {
-    double eners[2], virs[2], enersum, virsum, y0, f, g, h;
-    double r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
+    double enersum, virsum;
      double invscale, invscale2, invscale3;
-    int    ri0, ri1, ri, i, offstart, offset;
-    real   scale, *vdwtab, tabfactor, tmp;
+    double r, ea, eb, ec, pa, pb, pc, pd;
+    double y0, f, g, h;
+    int    ri, offset, tabfactor;
+
+    invscale  = 1.0/scale;
+    invscale2 = invscale*invscale;
+    invscale3 = invscale*invscale2;
+
+    /* Following summation derived from cubic spline definition,
+     * Numerical Recipies in C, second edition, p. 113-116.  Exact for
+     * the cubic spline.  We first calculate the negative of the
+     * energy from rvdw to rvdw_switch, assuming that g(r)=1, and then
+     * add the more standard, abrupt cutoff correction to that result,
+     * yielding the long-range correction for a switched function.  We
+     * perform both the pressure and energy loops at the same time for
+     * simplicity, as the computational cost is low. */
+
+    if (offstart == 0)
+    {
+        /* Since the dispersion table has been scaled down a factor
+         * 6.0 and the repulsion a factor 12.0 to compensate for the
+         * c6/c12 parameters inside nbfp[] being scaled up (to save
+         * flops in kernels), we need to correct for this.
+         */
+        tabfactor = 6.0;
+    }
+    else
+    {
+        tabfactor = 12.0;
+    }
+
+    enersum = 0.0;
+    virsum  = 0.0;
+    for (ri = rstart; ri < rend; ++ri)
+    {
+        r  = ri*invscale;
+        ea = invscale3;
+        eb = 2.0*invscale2*r;
+        ec = invscale*r*r;
+
+        pa = invscale3;
+        pb = 3.0*invscale2*r;
+        pc = 3.0*invscale*r*r;
+        pd = r*r*r;
+
+        /* this "8" is from the packing in the vdwtab array - perhaps
+           should be defined? */
+
+        offset = 8*ri + offstart;
+        y0     = vdwtab[offset];
+        f      = vdwtab[offset+1];
+        g      = vdwtab[offset+2];
+        h      = vdwtab[offset+3];
+
+        enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
+        virsum  +=  f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
+    }
+    *enerout = 4.0*M_PI*enersum*tabfactor;
+    *virout  = 4.0*M_PI*virsum*tabfactor;
+}
+
+void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
+{
+    double   eners[2], virs[2], enersum, virsum, y0, f, g, h;
+    double   r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
+    double   invscale, invscale2, invscale3;
+    int      ri0, ri1, ri, i, offstart, offset;
+    real     scale, *vdwtab, tabfactor, tmp;
  
      fr->enershiftsix    = 0;
      fr->enershifttwelve = 0;
@@ -2114,27 +2272,53 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
              eners[i] = 0;
              virs[i]  = 0;
          }
-        if ((fr->vdwtype == evdwSWITCH) || (fr->vdwtype == evdwSHIFT))
+        if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
+            (fr->vdw_modifier == eintmodPOTSWITCH) ||
+            (fr->vdw_modifier == eintmodFORCESWITCH) ||
+            (fr->vdwtype == evdwSHIFT) ||
+            (fr->vdwtype == evdwSWITCH))
          {
-            if (fr->rvdw_switch == 0)
+            if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
+                 (fr->vdw_modifier == eintmodFORCESWITCH) ||
+                 (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
              {
                  gmx_fatal(FARGS,
                            "With dispersion correction rvdw-switch can not be zero "
                            "for vdw-type = %s", evdw_names[fr->vdwtype]);
              }
  
-            scale  = fr->nblists[0].table_elec_vdw.scale;
+            scale  = fr->nblists[0].table_vdw.scale;
              vdwtab = fr->nblists[0].table_vdw.data;
  
              /* Round the cut-offs to exact table values for precision */
              ri0  = floor(fr->rvdw_switch*scale);
              ri1  = ceil(fr->rvdw*scale);
+
+            /* The code below has some support for handling force-switching, i.e.
+             * when the force (instead of potential) is switched over a limited
+             * region. This leads to a constant shift in the potential inside the
+             * switching region, which we can handle by adding a constant energy
+             * term in the force-switch case just like when we do potential-shift.
+             *
+             * For now this is not enabled, but to keep the functionality in the
+             * code we check separately for switch and shift. When we do force-switch
+             * the shifting point is rvdw_switch, while it is the cutoff when we
+             * have a classical potential-shift.
+             *
+             * For a pure potential-shift the potential has a constant shift
+             * all the way out to the cutoff, and that is it. For other forms
+             * we need to calculate the constant shift up to the point where we
+             * start modifying the potential.
+             */
+            ri0  = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
+
              r0   = ri0/scale;
              r1   = ri1/scale;
              rc3  = r0*r0*r0;
              rc9  = rc3*rc3*rc3;
  
-            if (fr->vdwtype == evdwSHIFT)
+            if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
+                (fr->vdwtype == evdwSHIFT))
              {
                  /* Determine the constant energy shift below rvdw_switch.
                   * Table has a scale factor since we have scaled it down to compensate
@@ -2143,6 +2327,12 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
                  fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
                  fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
              }
+            else if (fr->vdw_modifier == eintmodPOTSHIFT)
+            {
+                fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
+                fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
+            }
+
              /* Add the constant part from 0 to rvdw_switch.
               * This integration from 0 to rvdw_switch overcounts the number
               * of interactions by 1, as it also counts the self interaction.
@@ -2151,79 +2341,49 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
              eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
              eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
  
-            invscale  = 1.0/(scale);
-            invscale2 = invscale*invscale;
-            invscale3 = invscale*invscale2;
-
-            /* following summation derived from cubic spline definition,
-               Numerical Recipies in C, second edition, p. 113-116.  Exact
-               for the cubic spline.  We first calculate the negative of
-               the energy from rvdw to rvdw_switch, assuming that g(r)=1,
-               and then add the more standard, abrupt cutoff correction to
-               that result, yielding the long-range correction for a
-               switched function.  We perform both the pressure and energy
-               loops at the same time for simplicity, as the computational
-               cost is low. */
-
+            /* Calculate the contribution in the range [r0,r1] where we
+             * modify the potential. For a pure potential-shift modifier we will
+             * have ri0==ri1, and there will not be any contribution here.
+             */
              for (i = 0; i < 2; i++)
              {
-                enersum = 0.0; virsum = 0.0;
-                if (i == 0)
-                {
-                    offstart = 0;
-                    /* Since the dispersion table has been scaled down a factor 6.0 and the repulsion
-                     * a factor 12.0 to compensate for the c6/c12 parameters inside nbfp[] being scaled
-                     * up (to save flops in kernels), we need to correct for this.
-                     */
-                    tabfactor = 6.0;
-                }
-                else
-                {
-                    offstart  = 4;
-                    tabfactor = 12.0;
-                }
-                for (ri = ri0; ri < ri1; ri++)
-                {
-                    r  = ri*invscale;
-                    ea = invscale3;
-                    eb = 2.0*invscale2*r;
-                    ec = invscale*r*r;
-
-                    pa = invscale3;
-                    pb = 3.0*invscale2*r;
-                    pc = 3.0*invscale*r*r;
-                    pd = r*r*r;
-
-                    /* this "8" is from the packing in the vdwtab array - perhaps should be #define'ed? */
-                    offset = 8*ri + offstart;
-                    y0     = vdwtab[offset];
-                    f      = vdwtab[offset+1];
-                    g      = vdwtab[offset+2];
-                    h      = vdwtab[offset+3];
-
-                    enersum += y0*(ea/3 + eb/2 + ec) + f*(ea/4 + eb/3 + ec/2) + g*(ea/5 + eb/4 + ec/3) + h*(ea/6 + eb/5 + ec/4);
-                    virsum  += f*(pa/4 + pb/3 + pc/2 + pd) + 2*g*(pa/5 + pb/4 + pc/3 + pd/2) + 3*h*(pa/6 + pb/5 + pc/4 + pd/3);
-                }
-
-                enersum  *= 4.0*M_PI*tabfactor;
-                virsum   *= 4.0*M_PI*tabfactor;
+                enersum = 0;
+                virsum  = 0;
+                integrate_table(vdwtab, scale, (i == 0 ? 0 : 4), ri0, ri1, &enersum, &virsum);
                  eners[i] -= enersum;
                  virs[i]  -= virsum;
              }
  
-            /* now add the correction for rvdw_switch to infinity */
+            /* Alright: Above we compensated by REMOVING the parts outside r0
+             * corresponding to the ideal VdW 1/r6 and /r12 potentials.
+             *
+             * Regardless of whether r0 is the point where we start switching,
+             * or the cutoff where we calculated the constant shift, we include
+             * all the parts we are missing out to infinity from r0 by
+             * calculating the analytical dispersion correction.
+             */
              eners[0] += -4.0*M_PI/(3.0*rc3);
              eners[1] +=  4.0*M_PI/(9.0*rc9);
              virs[0]  +=  8.0*M_PI/rc3;
              virs[1]  += -16.0*M_PI/(3.0*rc9);
          }
-        else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER))
+        else if (fr->vdwtype == evdwCUT ||
+                 EVDW_PME(fr->vdwtype) ||
+                 fr->vdwtype == evdwUSER)
          {
              if (fr->vdwtype == evdwUSER && fplog)
              {
                  fprintf(fplog,
                          "WARNING: using dispersion correction with user tables\n");
              }
+
+            /* Note that with LJ-PME, the dispersion correction is multiplied
+             * by the difference between the actual C6 and the value of C6
+             * that would produce the combination rule.
+             * This means the normal energy and virial difference formulas
+             * can be used here.
+             */
+
              rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
              rc9  = rc3*rc3*rc3;
              /* Contribution beyond the cut-off */
@@ -2245,6 +2405,20 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
                        "Dispersion correction is not implemented for vdw-type = %s",
                        evdw_names[fr->vdwtype]);
          }
+
+        /* When we deprecate the group kernels the code below can go too */
+        if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
+        {
+            /* Calculate self-interaction coefficient (assuming that
+             * the reciprocal-space contribution is constant in the
+             * region that contributes to the self-interaction).
+             */
+            fr->enershiftsix = pow(fr->ewaldcoeff_lj, 6) / 6.0;
+
+            eners[0] += -pow(sqrt(M_PI)*fr->ewaldcoeff_lj, 3)/3.0;
+            virs[0]  +=  pow(sqrt(M_PI)*fr->ewaldcoeff_lj, 3);
+        }
+
          fr->enerdiffsix    = eners[0];
          fr->enerdifftwelve = eners[1];
          /* The 0.5 is due to the Gromacs definition of the virial */
@@ -2253,8 +2427,8 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
      }
  }
  
-void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
-                   gmx_large_int_t step, int natoms,
+void calc_dispcorr(t_inputrec *ir, t_forcerec *fr,
+                   int natoms,
                     matrix box, real lambda, tensor pres, tensor virial,
                     real *prescorr, real *enercorr, real *dvdlcorr)
  {
@@ -2355,10 +2529,6 @@ void calc_dispcorr(FILE *fplog, t_inputrec *ir, t_forcerec *fr,
              }
          }
  
-        if (fr->bSepDVDL && do_per_step(step, ir->nstlog))
-        {
-            gmx_print_sepdvdl(fplog, "Dispersion correction", *enercorr, dvdlambda);
-        }
          if (fr->efep != efepNO)
          {
              *dvdlcorr += dvdlambda;
@@ -2462,15 +2632,18 @@ void do_pbc_mtop(FILE *fplog, int ePBC, matrix box,
  void finish_run(FILE *fplog, t_commrec *cr,
                  t_inputrec *inputrec,
                  t_nrnb nrnb[], gmx_wallcycle_t wcycle,
-                gmx_runtime_t *runtime,
-                wallclock_gpu_t *gputimes,
+                gmx_walltime_accounting_t walltime_accounting,
+                nonbonded_verlet_t *nbv,
                  gmx_bool bWriteStat)
  {
      int     i, j;
      t_nrnb *nrnb_tot = NULL;
      real    delta_t;
      double  nbfs, mflop;
-
+    double  elapsed_time,
+            elapsed_time_over_all_ranks,
+            elapsed_time_over_all_threads,
+            elapsed_time_over_all_threads_over_all_ranks;
      wallcycle_sum(cr, wcycle);
  
      if (cr->nnodes > 1)
@@ -2486,14 +2659,25 @@ void finish_run(FILE *fplog, t_commrec *cr,
          nrnb_tot = nrnb;
      }
  
-#if defined(GMX_MPI) && !defined(GMX_THREAD_MPI)
+    elapsed_time                                 = walltime_accounting_get_elapsed_time(walltime_accounting);
+    elapsed_time_over_all_ranks                  = elapsed_time;
+    elapsed_time_over_all_threads                = walltime_accounting_get_elapsed_time_over_all_threads(walltime_accounting);
+    elapsed_time_over_all_threads_over_all_ranks = elapsed_time_over_all_threads;
+#ifdef GMX_MPI
      if (cr->nnodes > 1)
      {
-        /* reduce nodetime over all MPI processes in the current simulation */
-        double sum;
-        MPI_Allreduce(&runtime->proctime, &sum, 1, MPI_DOUBLE, MPI_SUM,
+        /* reduce elapsed_time over all MPI ranks in the current simulation */
+        MPI_Allreduce(&elapsed_time,
+                      &elapsed_time_over_all_ranks,
+                      1, MPI_DOUBLE, MPI_SUM,
+                      cr->mpi_comm_mysim);
+        elapsed_time_over_all_ranks /= cr->nnodes;
+        /* Reduce elapsed_time_over_all_threads over all MPI ranks in the
+         * current simulation. */
+        MPI_Allreduce(&elapsed_time_over_all_threads,
+                      &elapsed_time_over_all_threads_over_all_ranks,
+                      1, MPI_DOUBLE, MPI_SUM,
                        cr->mpi_comm_mysim);
-        runtime->proctime = sum;
      }
  #endif
  
@@ -2511,36 +2695,12 @@ void finish_run(FILE *fplog, t_commrec *cr,
          print_dd_statistics(cr, inputrec, fplog);
      }
  
-#ifdef GMX_MPI
-    if (PARTDECOMP(cr))
-    {
-        if (MASTER(cr))
-        {
-            t_nrnb     *nrnb_all;
-            int         s;
-            MPI_Status  stat;
-
-            snew(nrnb_all, cr->nnodes);
-            nrnb_all[0] = *nrnb;
-            for (s = 1; s < cr->nnodes; s++)
-            {
-                MPI_Recv(nrnb_all[s].n, eNRNB, MPI_DOUBLE, s, 0,
-                         cr->mpi_comm_mysim, &stat);
-            }
-            pr_load(fplog, cr, nrnb_all);
-            sfree(nrnb_all);
-        }
-        else
-        {
-            MPI_Send(nrnb->n, eNRNB, MPI_DOUBLE, MASTERRANK(cr), 0,
-                     cr->mpi_comm_mysim);
-        }
-    }
-#endif
-
      if (SIMMASTER(cr))
      {
-        wallcycle_print(fplog, cr->nnodes, cr->npmenodes, runtime->realtime,
+        wallclock_gpu_t* gputimes = use_GPU(nbv) ?
+            nbnxn_cuda_get_timings(nbv->cu_nbv) : NULL;
+        wallcycle_print(fplog, cr->nnodes, cr->npmenodes,
+                        elapsed_time_over_all_ranks,
                          wcycle, gputimes);
  
          if (EI_DYNAMICS(inputrec->eI))
@@ -2554,13 +2714,17 @@ void finish_run(FILE *fplog, t_commrec *cr,
  
          if (fplog)
          {
-            print_perf(fplog, runtime->proctime, runtime->realtime,
-                       runtime->nsteps_done, delta_t, nbfs, mflop);
+            print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks,
+                       elapsed_time_over_all_ranks,
+                       walltime_accounting_get_nsteps_done(walltime_accounting),
+                       delta_t, nbfs, mflop);
          }
          if (bWriteStat)
          {
-            print_perf(stderr, runtime->proctime, runtime->realtime,
-                       runtime->nsteps_done, delta_t, nbfs, mflop);
+            print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks,
+                       elapsed_time_over_all_ranks,
+                       walltime_accounting_get_nsteps_done(walltime_accounting),
+                       delta_t, nbfs, mflop);
          }
      }
  }
@@ -2644,9 +2808,10 @@ void init_md(FILE *fplog,
               t_nrnb *nrnb, gmx_mtop_t *mtop,
               gmx_update_t *upd,
               int nfile, const t_filenm fnm[],
-             gmx_mdoutf_t **outf, t_mdebin **mdebin,
+             gmx_mdoutf_t *outf, t_mdebin **mdebin,
               tensor force_vir, tensor shake_vir, rvec mu_tot,
-             gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags)
+             gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
+             gmx_wallcycle_t wcycle)
  {
      int  i, j, n;
      real tmpt, mod;
@@ -2692,16 +2857,20 @@ void init_md(FILE *fplog,
          {
              please_cite(fplog, "Bussi2007a");
          }
+        if (ir->eI == eiSD1)
+        {
+            please_cite(fplog, "Goga2012");
+        }
      }
  
      init_nrnb(nrnb);
  
      if (nfile != -1)
      {
-        *outf = init_mdoutf(nfile, fnm, Flags, cr, ir, oenv);
+        *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, ir, mtop, oenv, wcycle);
  
-        *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : (*outf)->fp_ene,
-                              mtop, ir, (*outf)->fp_dhdl);
+        *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : mdoutf_get_fp_ene(*outf),
+                              mtop, ir, mdoutf_get_fp_dhdl(*outf));
      }
  
      if (ir->bAdress)