Added GPU local wait to load balancing

[alexxy/gromacs.git] / src / gromacs / mdlib / sim_util.c
diff --git a/src/gromacs/mdlib/sim_util.c b/src/gromacs/mdlib/sim_util.c

index 1251f0c4e253cff3e5b39d434c2337c8e737c684..a6b166ccfd5acedb51742f9063d40207edda4e23 100644 (file)
--- a/src/gromacs/mdlib/sim_util.c
+++ b/src/gromacs/mdlib/sim_util.c
@@ -38,14 +38,17 @@
  #include <config.h>
  #endif
  
+#include <assert.h>
+#include <math.h>
  #include <stdio.h>
+#include <string.h>
  #ifdef HAVE_SYS_TIME_H
  #include <sys/time.h>
  #endif
-#include <math.h>
+
  #include "typedefs.h"
-#include "string2.h"
-#include "smalloc.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/smalloc.h"
  #include "names.h"
  #include "txtdump.h"
  #include "pbc.h"
@@ -69,7 +72,6 @@
  #include "constr.h"
  #include "xvgr.h"
  #include "copyrite.h"
-#include "gromacs/random/random.h"
  #include "domdec.h"
  #include "genborn.h"
  #include "nbnxn_atomdata.h"
@@ -78,6 +80,9 @@
  #include "nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn.h"
  #include "nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn.h"
  #include "nbnxn_kernels/nbnxn_kernel_gpu_ref.h"
+#include "nonbonded.h"
+#include "../gmxlib/nonbonded/nb_kernel.h"
+#include "../gmxlib/nonbonded/nb_free_energy.h"
  
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/timing/walltime_accounting.h"
@@ -85,10 +90,12 @@
  #include "gromacs/essentialdynamics/edsam.h"
  #include "gromacs/pulling/pull.h"
  #include "gromacs/pulling/pull_rotation.h"
-
+#include "gromacs/imd/imd.h"
  #include "adress.h"
  #include "qmmm.h"
  
+#include "gmx_omp_nthreads.h"
+
  #include "nbnxn_cuda_data_mgmt.h"
  #include "nbnxn_cuda/nbnxn_cuda.h"
  
@@ -149,33 +156,29 @@ void print_time(FILE                     *out,
  }
  
  void print_date_and_time(FILE *fplog, int nodeid, const char *title,
-                         const gmx_walltime_accounting_t walltime_accounting)
+                         double the_time)
  {
-    int    i;
-    char   timebuf[STRLEN];
      char   time_string[STRLEN];
-    time_t tmptime;
  
-    if (fplog)
+    if (!fplog)
      {
-        if (walltime_accounting != NULL)
-        {
-            tmptime = (time_t) walltime_accounting_get_start_time_stamp(walltime_accounting);
-            gmx_ctime_r(&tmptime, timebuf, STRLEN);
-        }
-        else
-        {
-            tmptime = (time_t) gmx_gettime();
-            gmx_ctime_r(&tmptime, timebuf, STRLEN);
-        }
+        return;
+    }
+
+    {
+        int    i;
+        char   timebuf[STRLEN];
+        time_t temp_time = (time_t) the_time;
+
+        gmx_ctime_r(&temp_time, timebuf, STRLEN);
          for (i = 0; timebuf[i] >= ' '; i++)
          {
              time_string[i] = timebuf[i];
          }
          time_string[i] = '\0';
-
-        fprintf(fplog, "%s on node %d %s\n", title, nodeid, time_string);
      }
+
+    fprintf(fplog, "%s on rank %d %s\n", title, nodeid, time_string);
  }
  
  void print_start(FILE *fplog, t_commrec *cr,
@@ -185,7 +188,8 @@ void print_start(FILE *fplog, t_commrec *cr,
      char buf[STRLEN];
  
      sprintf(buf, "Started %s", name);
-    print_date_and_time(fplog, cr->nodeid, buf, walltime_accounting);
+    print_date_and_time(fplog, cr->nodeid, buf,
+                        walltime_accounting_get_start_time_stamp(walltime_accounting));
  }
  
  static void sum_forces(int start, int end, rvec f[], rvec flr[])
@@ -392,7 +396,8 @@ static void pull_potential_wrapper(FILE *fplog,
                                     t_mdatoms *mdatoms,
                                     gmx_enerdata_t *enerd,
                                     real *lambda,
-                                   double t)
+                                   double t,
+                                   gmx_wallcycle_t wcycle)
  {
      t_pbc  pbc;
      real   dvdl;
@@ -402,6 +407,7 @@ static void pull_potential_wrapper(FILE *fplog,
       * The virial contribution is calculated directly,
       * which is why we call pull_potential after calc_virial.
       */
+    wallcycle_start(wcycle, ewcPULLPOT);
      set_pbc(&pbc, ir->ePBC, box);
      dvdl                     = 0;
      enerd->term[F_COM_PULL] +=
@@ -412,6 +418,7 @@ static void pull_potential_wrapper(FILE *fplog,
          gmx_print_sepdvdl(fplog, "Com pull", enerd->term[F_COM_PULL], dvdl);
      }
      enerd->dvdl_lin[efptRESTRAINT] += dvdl;
+    wallcycle_stop(wcycle, ewcPULLPOT);
  }
  
  static void pme_receive_force_ener(FILE           *fplog,
@@ -670,17 +677,131 @@ static void do_nb_verlet(t_forcerec *fr,
      if (ic->vdw_modifier == eintmodFORCESWITCH)
      {
          /* We add up the switch cost separately */
-        inc_nrnb(nrnb, eNR_NBNXN_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
+        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                   nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
      }
      if (ic->vdw_modifier == eintmodPOTSWITCH)
      {
          /* We add up the switch cost separately */
-        inc_nrnb(nrnb, eNR_NBNXN_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
+        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
+                 nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
+    }
+    if (ic->vdwtype == evdwPME)
+    {
+        /* We add up the LJ Ewald cost separately */
+        inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD+((flags & GMX_FORCE_ENERGY) ? 1 : 0),
                   nbvg->nbl_lists.natpair_ljq + nbvg->nbl_lists.natpair_lj);
      }
  }
  
+static void do_nb_verlet_fep(nbnxn_pairlist_set_t *nbl_lists,
+                             t_forcerec           *fr,
+                             rvec                  x[],
+                             rvec                  f[],
+                             t_mdatoms            *mdatoms,
+                             t_lambda             *fepvals,
+                             real                 *lambda,
+                             gmx_enerdata_t       *enerd,
+                             int                   flags,
+                             t_nrnb               *nrnb,
+                             gmx_wallcycle_t       wcycle)
+{
+    int              donb_flags;
+    nb_kernel_data_t kernel_data;
+    real             lam_i[efptNR];
+    real             dvdl_nb[efptNR];
+    int              th;
+    int              i, j;
+
+    donb_flags = 0;
+    /* Add short-range interactions */
+    donb_flags |= GMX_NONBONDED_DO_SR;
+
+    /* Currently all group scheme kernels always calculate (shift-)forces */
+    if (flags & GMX_FORCE_FORCES)
+    {
+        donb_flags |= GMX_NONBONDED_DO_FORCE;
+    }
+    if (flags & GMX_FORCE_VIRIAL)
+    {
+        donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
+    }
+    if (flags & GMX_FORCE_ENERGY)
+    {
+        donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
+    }
+    if (flags & GMX_FORCE_DO_LR)
+    {
+        donb_flags |= GMX_NONBONDED_DO_LR;
+    }
+
+    kernel_data.flags  = donb_flags;
+    kernel_data.lambda = lambda;
+    kernel_data.dvdl   = dvdl_nb;
+
+    kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR];
+    kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR];
+
+    /* reset free energy components */
+    for (i = 0; i < efptNR; i++)
+    {
+        dvdl_nb[i]  = 0;
+    }
+
+    assert(gmx_omp_nthreads_get(emntNonbonded) == nbl_lists->nnbl);
+
+    wallcycle_sub_start(wcycle, ewcsNONBONDED);
+#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
+    for (th = 0; th < nbl_lists->nnbl; th++)
+    {
+        gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
+                                  x, f, fr, mdatoms, &kernel_data, nrnb);
+    }
+
+    if (fepvals->sc_alpha != 0)
+    {
+        enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
+        enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+    else
+    {
+        enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
+        enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
+    }
+
+    /* If we do foreign lambda and we have soft-core interactions
+     * we have to recalculate the (non-linear) energies contributions.
+     */
+    if (fepvals->n_lambda > 0 && (flags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
+    {
+        kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
+        kernel_data.lambda         = lam_i;
+        kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR];
+        kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR];
+        /* Note that we add to kernel_data.dvdl, but ignore the result */
+
+        for (i = 0; i < enerd->n_lambda; i++)
+        {
+            for (j = 0; j < efptNR; j++)
+            {
+                lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
+            }
+            reset_foreign_enerdata(enerd);
+#pragma omp parallel for schedule(static) num_threads(nbl_lists->nnbl)
+            for (th = 0; th < nbl_lists->nnbl; th++)
+            {
+                gmx_nb_free_energy_kernel(nbl_lists->nbl_fep[th],
+                                          x, f, fr, mdatoms, &kernel_data, nrnb);
+            }
+
+            sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
+            enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
+        }
+    }
+
+    wallcycle_sub_stop(wcycle, ewcsNONBONDED);
+}
+
  void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                           t_inputrec *inputrec,
                           gmx_int64_t step, t_nrnb *nrnb, gmx_wallcycle_t wcycle,
@@ -814,10 +935,6 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          if (EVDW_PME(fr->vdwtype))
          {
              pme_flags |= GMX_PME_DO_LJ;
-            if (fr->ljpme_combination_rule == eljpmeLB)
-            {
-                pme_flags |= GMX_PME_LJ_LB;
-            }
          }
  
          gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
@@ -1147,6 +1264,29 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
                       nrnb, wcycle);
      }
  
+    if (fr->efep != efepNO)
+    {
+        /* Calculate the local and non-local free energy interactions here.
+         * Happens here on the CPU both with and without GPU.
+         */
+        if (fr->nbv->grp[eintLocal].nbl_lists.nbl_fep[0]->nrj > 0)
+        {
+            do_nb_verlet_fep(&fr->nbv->grp[eintLocal].nbl_lists,
+                             fr, x, f, mdatoms,
+                             inputrec->fepvals, lambda,
+                             enerd, flags, nrnb, wcycle);
+        }
+
+        if (DOMAINDECOMP(cr) &&
+            fr->nbv->grp[eintNonlocal].nbl_lists.nbl_fep[0]->nrj > 0)
+        {
+            do_nb_verlet_fep(&fr->nbv->grp[eintNonlocal].nbl_lists,
+                             fr, x, f, mdatoms,
+                             inputrec->fepvals, lambda,
+                             enerd, flags, nrnb, wcycle);
+        }
+    }
+
      if (!bUseOrEmulGPU || bDiffKernels)
      {
          int aloc;
@@ -1273,6 +1413,16 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
  
      if (bDoForces && DOMAINDECOMP(cr))
      {
+        if (bUseGPU)
+        {
+            /* We are done with the CPU compute, but the GPU local non-bonded
+             * kernel can still be running while we communicate the forces.
+             * We start a counter here, so we can, hopefully, time the rest
+             * of the GPU kernel execution and data transfer.
+             */
+            wallcycle_start(wcycle, ewcWAIT_GPU_NB_L_EST);
+        }
+
          /* Communicate the forces */
          wallcycle_start(wcycle, ewcMOVEF);
          dd_move_f(cr->dd, f, fr->fshift);
@@ -1303,13 +1453,44 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          /* wait for local forces (or calculate in emulation mode) */
          if (bUseGPU)
          {
+            float       cycles_tmp, cycles_wait_est;
+            const float cuda_api_overhead_margin = 50000.0f; /* cycles */
+
              wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
              nbnxn_cuda_wait_gpu(nbv->cu_nbv,
                                  nbv->grp[eintLocal].nbat,
                                  flags, eatLocal,
                                  enerd->grpp.ener[egLJSR], enerd->grpp.ener[egCOULSR],
                                  fr->fshift);
-            cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+            cycles_tmp      = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
+            if (bDoForces && DOMAINDECOMP(cr))
+            {
+                cycles_wait_est = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L_EST);
+
+                if (cycles_tmp < cuda_api_overhead_margin)
+                {
+                    /* We measured few cycles, it could be that the kernel
+                     * and transfer finished earlier and there was no actual
+                     * wait time, only API call overhead.
+                     * Then the actual time could be anywhere between 0 and
+                     * cycles_wait_est. As a compromise, we use half the time.
+                     */
+                    cycles_wait_est *= 0.5f;
+                }
+            }
+            else
+            {
+                /* No force communication so we actually timed the wait */
+                cycles_wait_est = cycles_tmp;
+            }
+            /* Even though this is after dd_move_f, the actual task we are
+             * waiting for runs asynchronously with dd_move_f and we usually
+             * have nothing to balance it with, so we can and should add
+             * the time to the force time for load balancing.
+             */
+            cycles_force    += cycles_wait_est;
+            cycles_wait_gpu += cycles_wait_est;
  
              /* now clear the GPU outputs while we finish the step on the CPU */
  
@@ -1390,8 +1571,12 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
  
      if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
      {
+        /* Since the COM pulling is always done mass-weighted, no forces are
+         * applied to vsites and this call can be done after vsite spreading.
+         */
          pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
-                               f, vir_force, mdatoms, enerd, lambda, t);
+                               f, vir_force, mdatoms, enerd, lambda, t,
+                               wcycle);
      }
  
      /* Add the forces from enforced rotation potentials (if any) */
@@ -1402,6 +1587,9 @@ void do_force_cutsVERLET(FILE *fplog, t_commrec *cr,
          wallcycle_stop(wcycle, ewcROTadd);
      }
  
+    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
+    IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
+
      if (PAR(cr) && !(cr->duty & DUTY_PME))
      {
          /* In case of node-splitting, the PP nodes receive the long-range
@@ -1561,10 +1749,6 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
          if (EVDW_PME(fr->vdwtype))
          {
              pme_flags |= GMX_PME_DO_LJ;
-            if (fr->ljpme_combination_rule == eljpmeLB)
-            {
-                pme_flags |= GMX_PME_LJ_LB;
-            }
          }
  
          gmx_pme_send_coordinates(cr, bBS ? boxs : box, x,
@@ -1866,7 +2050,8 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
      if (inputrec->ePull == epullUMBRELLA || inputrec->ePull == epullCONST_F)
      {
          pull_potential_wrapper(fplog, bSepDVDL, cr, inputrec, box, x,
-                               f, vir_force, mdatoms, enerd, lambda, t);
+                               f, vir_force, mdatoms, enerd, lambda, t,
+                               wcycle);
      }
  
      /* Add the forces from enforced rotation potentials (if any) */
@@ -1877,6 +2062,9 @@ void do_force_cutsGROUP(FILE *fplog, t_commrec *cr,
          wallcycle_stop(wcycle, ewcROTadd);
      }
  
+    /* Add forces from interactive molecular dynamics (IMD), if bIMD == TRUE. */
+    IMD_apply_forces(inputrec->bIMD, inputrec->imd, cr, f, wcycle);
+
      if (PAR(cr) && !(cr->duty & DUTY_PME))
      {
          /* In case of node-splitting, the PP nodes receive the long-range
@@ -1991,7 +2179,7 @@ void do_constrain_first(FILE *fplog, gmx_constr_t constr,
  
      /* constrain the current position */
      constrain(NULL, TRUE, FALSE, constr, &(top->idef),
-              ir, NULL, cr, step, 0, md,
+              ir, NULL, cr, step, 0, 1.0, md,
                state->x, state->x, NULL,
                fr->bMolPBC, state->box,
                state->lambda[efptBONDED], &dvdl_dum,
@@ -2003,7 +2191,7 @@ void do_constrain_first(FILE *fplog, gmx_constr_t constr,
          /* also may be useful if we need the ekin from the halfstep for velocity verlet */
          /* might not yet treat veta correctly */
          constrain(NULL, TRUE, FALSE, constr, &(top->idef),
-                  ir, NULL, cr, step, 0, md,
+                  ir, NULL, cr, step, 0, 1.0, md,
                    state->x, state->v, state->v,
                    fr->bMolPBC, state->box,
                    state->lambda[efptBONDED], &dvdl_dum,
@@ -2034,7 +2222,7 @@ void do_constrain_first(FILE *fplog, gmx_constr_t constr,
          }
          dvdl_dum = 0;
          constrain(NULL, TRUE, FALSE, constr, &(top->idef),
-                  ir, NULL, cr, step, -1, md,
+                  ir, NULL, cr, step, -1, 1.0, md,
                    state->x, savex, NULL,
                    fr->bMolPBC, state->box,
                    state->lambda[efptBONDED], &dvdl_dum,
@@ -2123,11 +2311,11 @@ integrate_table(real vdwtab[], real scale, int offstart, int rstart, int rend,
  
  void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
  {
-    double eners[2], virs[2], enersum, virsum, y0, f, g, h;
-    double r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
-    double invscale, invscale2, invscale3;
-    int    ri0, ri1, ri, i, offstart, offset;
-    real   scale, *vdwtab, tabfactor, tmp;
+    double   eners[2], virs[2], enersum, virsum, y0, f, g, h;
+    double   r0, r1, r, rc3, rc9, ea, eb, ec, pa, pb, pc, pd;
+    double   invscale, invscale2, invscale3;
+    int      ri0, ri1, ri, i, offstart, offset;
+    real     scale, *vdwtab, tabfactor, tmp;
  
      fr->enershiftsix    = 0;
      fr->enershifttwelve = 0;
@@ -2143,30 +2331,53 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
              eners[i] = 0;
              virs[i]  = 0;
          }
-        if (fr->vdwtype == evdwSWITCH || fr->vdwtype == evdwSHIFT ||
-            fr->vdw_modifier == eintmodPOTSWITCH ||
-            fr->vdw_modifier == eintmodFORCESWITCH)
+        if ((fr->vdw_modifier == eintmodPOTSHIFT) ||
+            (fr->vdw_modifier == eintmodPOTSWITCH) ||
+            (fr->vdw_modifier == eintmodFORCESWITCH) ||
+            (fr->vdwtype == evdwSHIFT) ||
+            (fr->vdwtype == evdwSWITCH))
          {
-            if (fr->rvdw_switch == 0)
+            if (((fr->vdw_modifier == eintmodPOTSWITCH) ||
+                 (fr->vdw_modifier == eintmodFORCESWITCH) ||
+                 (fr->vdwtype == evdwSWITCH)) && fr->rvdw_switch == 0)
              {
                  gmx_fatal(FARGS,
                            "With dispersion correction rvdw-switch can not be zero "
                            "for vdw-type = %s", evdw_names[fr->vdwtype]);
              }
  
-            scale  = fr->nblists[0].table_elec_vdw.scale;
+            scale  = fr->nblists[0].table_vdw.scale;
              vdwtab = fr->nblists[0].table_vdw.data;
  
              /* Round the cut-offs to exact table values for precision */
              ri0  = floor(fr->rvdw_switch*scale);
              ri1  = ceil(fr->rvdw*scale);
+
+            /* The code below has some support for handling force-switching, i.e.
+             * when the force (instead of potential) is switched over a limited
+             * region. This leads to a constant shift in the potential inside the
+             * switching region, which we can handle by adding a constant energy
+             * term in the force-switch case just like when we do potential-shift.
+             *
+             * For now this is not enabled, but to keep the functionality in the
+             * code we check separately for switch and shift. When we do force-switch
+             * the shifting point is rvdw_switch, while it is the cutoff when we
+             * have a classical potential-shift.
+             *
+             * For a pure potential-shift the potential has a constant shift
+             * all the way out to the cutoff, and that is it. For other forms
+             * we need to calculate the constant shift up to the point where we
+             * start modifying the potential.
+             */
+            ri0  = (fr->vdw_modifier == eintmodPOTSHIFT) ? ri1 : ri0;
+
              r0   = ri0/scale;
              r1   = ri1/scale;
              rc3  = r0*r0*r0;
              rc9  = rc3*rc3*rc3;
  
-            if (fr->vdwtype == evdwSHIFT ||
-                fr->vdw_modifier == eintmodFORCESWITCH)
+            if ((fr->vdw_modifier == eintmodFORCESWITCH) ||
+                (fr->vdwtype == evdwSHIFT))
              {
                  /* Determine the constant energy shift below rvdw_switch.
                   * Table has a scale factor since we have scaled it down to compensate
@@ -2175,6 +2386,12 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
                  fr->enershiftsix    = (real)(-1.0/(rc3*rc3)) - 6.0*vdwtab[8*ri0];
                  fr->enershifttwelve = (real)( 1.0/(rc9*rc3)) - 12.0*vdwtab[8*ri0 + 4];
              }
+            else if (fr->vdw_modifier == eintmodPOTSHIFT)
+            {
+                fr->enershiftsix    = (real)(-1.0/(rc3*rc3));
+                fr->enershifttwelve = (real)( 1.0/(rc9*rc3));
+            }
+
              /* Add the constant part from 0 to rvdw_switch.
               * This integration from 0 to rvdw_switch overcounts the number
               * of interactions by 1, as it also counts the self interaction.
@@ -2182,6 +2399,11 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
               */
              eners[0] += 4.0*M_PI*fr->enershiftsix*rc3/3.0;
              eners[1] += 4.0*M_PI*fr->enershifttwelve*rc3/3.0;
+
+            /* Calculate the contribution in the range [r0,r1] where we
+             * modify the potential. For a pure potential-shift modifier we will
+             * have ri0==ri1, and there will not be any contribution here.
+             */
              for (i = 0; i < 2; i++)
              {
                  enersum = 0;
@@ -2191,46 +2413,36 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
                  virs[i]  -= virsum;
              }
  
-            /* now add the correction for rvdw_switch to infinity */
+            /* Alright: Above we compensated by REMOVING the parts outside r0
+             * corresponding to the ideal VdW 1/r6 and /r12 potentials.
+             *
+             * Regardless of whether r0 is the point where we start switching,
+             * or the cutoff where we calculated the constant shift, we include
+             * all the parts we are missing out to infinity from r0 by
+             * calculating the analytical dispersion correction.
+             */
              eners[0] += -4.0*M_PI/(3.0*rc3);
              eners[1] +=  4.0*M_PI/(9.0*rc9);
              virs[0]  +=  8.0*M_PI/rc3;
              virs[1]  += -16.0*M_PI/(3.0*rc9);
          }
-        else if (EVDW_PME(fr->vdwtype))
-        {
-            scale  = fr->nblists[0].table_vdw.scale;
-            vdwtab = fr->nblists[0].table_vdw.data;
-
-            ri0  = floor(fr->rvdw_switch*scale);
-            ri1  = ceil(fr->rvdw*scale);
-            r0   = ri0/scale;
-            r1   = ri1/scale;
-            rc3  = r0*r0*r0;
-            rc9  = rc3*rc3*rc3;
-
-            /* Calculate self-interaction coefficient (assuming that
-             * the reciprocal-space contribution is constant in the
-             * region that contributes to the self-interaction).
-             */
-            fr->enershiftsix = pow(fr->ewaldcoeff_lj, 6) / 6.0;
-
-            /* Add analytical corrections, C6 for the whole range, C12
-             * from rvdw_switch to infinity.
-             */
-
-            eners[0] += -pow(sqrt(M_PI)*fr->ewaldcoeff_lj, 3)/3.0;
-            eners[1] +=  4.0*M_PI/(9.0*rc9);
-            virs[0]  +=  pow(sqrt(M_PI)*fr->ewaldcoeff_lj, 3);
-            virs[1]  += -16.0*M_PI/(3.0*rc9);
-        }
-        else if ((fr->vdwtype == evdwCUT) || (fr->vdwtype == evdwUSER))
+        else if (fr->vdwtype == evdwCUT ||
+                 EVDW_PME(fr->vdwtype) ||
+                 fr->vdwtype == evdwUSER)
          {
              if (fr->vdwtype == evdwUSER && fplog)
              {
                  fprintf(fplog,
                          "WARNING: using dispersion correction with user tables\n");
              }
+
+            /* Note that with LJ-PME, the dispersion correction is multiplied
+             * by the difference between the actual C6 and the value of C6
+             * that would produce the combination rule.
+             * This means the normal energy and virial difference formulas
+             * can be used here.
+             */
+
              rc3  = fr->rvdw*fr->rvdw*fr->rvdw;
              rc9  = rc3*rc3*rc3;
              /* Contribution beyond the cut-off */
@@ -2252,6 +2464,20 @@ void calc_enervirdiff(FILE *fplog, int eDispCorr, t_forcerec *fr)
                        "Dispersion correction is not implemented for vdw-type = %s",
                        evdw_names[fr->vdwtype]);
          }
+
+        /* When we deprecate the group kernels the code below can go too */
+        if (fr->vdwtype == evdwPME && fr->cutoff_scheme == ecutsGROUP)
+        {
+            /* Calculate self-interaction coefficient (assuming that
+             * the reciprocal-space contribution is constant in the
+             * region that contributes to the self-interaction).
+             */
+            fr->enershiftsix = pow(fr->ewaldcoeff_lj, 6) / 6.0;
+
+            eners[0] += -pow(sqrt(M_PI)*fr->ewaldcoeff_lj, 3)/3.0;
+            virs[0]  +=  pow(sqrt(M_PI)*fr->ewaldcoeff_lj, 3);
+        }
+
          fr->enerdiffsix    = eners[0];
          fr->enerdifftwelve = eners[1];
          /* The 0.5 is due to the Gromacs definition of the virial */
@@ -2645,7 +2871,8 @@ void init_md(FILE *fplog,
               int nfile, const t_filenm fnm[],
               gmx_mdoutf_t *outf, t_mdebin **mdebin,
               tensor force_vir, tensor shake_vir, rvec mu_tot,
-             gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags)
+             gmx_bool *bSimAnn, t_vcm **vcm, unsigned long Flags,
+             gmx_wallcycle_t wcycle)
  {
      int  i, j, n;
      real tmpt, mod;
@@ -2691,13 +2918,17 @@ void init_md(FILE *fplog,
          {
              please_cite(fplog, "Bussi2007a");
          }
+        if (ir->eI == eiSD1)
+        {
+            please_cite(fplog, "Goga2012");
+        }
      }
  
      init_nrnb(nrnb);
  
      if (nfile != -1)
      {
-        *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, ir, mtop, oenv);
+        *outf = init_mdoutf(fplog, nfile, fnm, Flags, cr, ir, mtop, oenv, wcycle);
  
          *mdebin = init_mdebin((Flags & MD_APPENDFILES) ? NULL : mdoutf_get_fp_ene(*outf),
                                mtop, ir, mdoutf_get_fp_dhdl(*outf));