Merge branch release-2016

[alexxy/gromacs.git] / src / gromacs / mdlib / forcerec.cpp
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp

index 7c3a56e8133fb23a1148c1f2fdbb186376bf84e2..12f4e3cacbf33ff34d00dae7c59f4574c6c64114 100644 (file)
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -53,7 +53,6 @@
  #include "gromacs/domdec/domdec_struct.h"
  #include "gromacs/ewald/ewald.h"
  #include "gromacs/fileio/filetypes.h"
-#include "gromacs/gmxlib/md_logging.h"
  #include "gromacs/gmxlib/network.h"
  #include "gromacs/gmxlib/nonbonded/nonbonded.h"
  #include "gromacs/gpu_utils/gpu_utils.h"
@@ -74,6 +73,7 @@
  #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
  #include "gromacs/mdlib/nbnxn_search.h"
  #include "gromacs/mdlib/nbnxn_simd.h"
+#include "gromacs/mdlib/nbnxn_tuning.h"
  #include "gromacs/mdlib/nbnxn_util.h"
  #include "gromacs/mdlib/ns.h"
  #include "gromacs/mdlib/qmmm.h"
@@ -81,6 +81,7 @@
  #include "gromacs/mdtypes/commrec.h"
  #include "gromacs/mdtypes/fcdata.h"
  #include "gromacs/mdtypes/group.h"
+#include "gromacs/mdtypes/iforceprovider.h"
  #include "gromacs/mdtypes/inputrec.h"
  #include "gromacs/mdtypes/md_enums.h"
  #include "gromacs/pbcutil/ishift.h"
@@ -93,15 +94,16 @@
  #include "gromacs/utility/exceptions.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/logger.h"
  #include "gromacs/utility/pleasecite.h"
  #include "gromacs/utility/smalloc.h"
-#include "gromacs/utility/stringutil.h"
+#include "gromacs/utility/strconvert.h"
  
  #include "nbnxn_gpu_jit_support.h"
  
  const char *egrp_nm[egNR+1] = {
      "Coul-SR", "LJ-SR", "Buck-SR",
-    "Coul-14", "LJ-14", NULL
+    "Coul-14", "LJ-14", nullptr
  };
  
  t_forcerec *mk_forcerec(void)
@@ -349,7 +351,7 @@ check_solvent_cg(const gmx_moltype_t    *molt,
  
      /* Check if we are doing QM on this group */
      qm = FALSE;
-    if (qm_grpnr != NULL)
+    if (qm_grpnr != nullptr)
      {
          for (j = j0; j < j1 && !qm; j++)
          {
@@ -543,7 +545,7 @@ check_solvent(FILE  *                fp,
      }
  
      n_solvent_parameters = 0;
-    solvent_parameters   = NULL;
+    solvent_parameters   = nullptr;
      /* Allocate temporary array for solvent type */
      snew(cg_sp, mtop->nmolblock);
  
@@ -566,7 +568,7 @@ check_solvent(FILE  *                fp,
              {
                  check_solvent_cg(molt, cg_mol, nmol,
                                   mtop->groups.grpnr[egcQMMM] ?
-                                 mtop->groups.grpnr[egcQMMM]+at_offset+am : 0,
+                                 mtop->groups.grpnr[egcQMMM]+at_offset+am : nullptr,
                                   &mtop->groups.grps[egcQMMM],
                                   fr,
                                   &n_solvent_parameters, &solvent_parameters,
@@ -622,7 +624,7 @@ check_solvent(FILE  *                fp,
      }
      sfree(cg_sp);
  
-    if (bestsol != esolNO && fp != NULL)
+    if (bestsol != esolNO && fp != nullptr)
      {
          fprintf(fp, "\nEnabling %s-like water optimization for %d molecules.\n\n",
                  esol_names[bestsol],
@@ -702,7 +704,7 @@ static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop,
                  {
                      bId = FALSE;
                  }
-                if (mtop->groups.grpnr[egcQMMM] != NULL)
+                if (mtop->groups.grpnr[egcQMMM] != nullptr)
                  {
                      for (ai = a0; ai < a1; ai++)
                      {
@@ -991,7 +993,7 @@ void update_forcerec(t_forcerec *fr, matrix box)
  {
      if (fr->eeltype == eelGRF)
      {
-        calc_rffac(NULL, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
+        calc_rffac(nullptr, fr->eeltype, fr->epsilon_r, fr->epsilon_rf,
                     fr->rcoulomb, fr->temp, fr->zsquare, box,
                     &fr->kappa, &fr->k_rf, &fr->c_rf);
      }
@@ -1007,7 +1009,7 @@ void set_avcsixtwelve(FILE *fplog, t_forcerec *fr, const gmx_mtop_t *mtop)
      int             ntp, *typecount;
      gmx_bool        bBHAM;
      real           *nbfp;
-    real           *nbfp_comb = NULL;
+    real           *nbfp_comb = nullptr;
  
      ntp   = fr->ntype;
      bBHAM = fr->bBHAM;
@@ -1214,7 +1216,7 @@ void set_avcsixtwelve(FILE *fplog, t_forcerec *fr, const gmx_mtop_t *mtop)
          sfree(nbfp_comb);
      }
  
-    if (fplog != NULL)
+    if (fplog != nullptr)
      {
          if (fr->eDispCorr == edispcAllEner ||
              fr->eDispCorr == edispcAllEnerPres)
@@ -1296,7 +1298,7 @@ static void make_nbf_tables(FILE *fp,
      char buf[STRLEN];
      int  i, j;
  
-    if (tabfn == NULL)
+    if (tabfn == nullptr)
      {
          if (debug)
          {
@@ -1433,10 +1435,10 @@ static bondedtable_t *make_bonded_tables(FILE *fplog,
      int            ncount, *count;
      bondedtable_t *tab;
  
-    tab = NULL;
+    tab = nullptr;
  
      ncount = 0;
-    count  = NULL;
+    count  = nullptr;
      count_tables(ftype1, ftype2, mtop, &ncount, &count);
  
      // Are there any relevant tabulated bond interactions?
@@ -1504,16 +1506,8 @@ void forcerec_set_ranges(t_forcerec *fr,
  
      if (fr->bF_NoVirSum)
      {
-        fr->f_novirsum_n = natoms_f_novirsum;
-        if (fr->f_novirsum_n > fr->f_novirsum_nalloc)
-        {
-            fr->f_novirsum_nalloc = over_alloc_dd(fr->f_novirsum_n);
-            srenew(fr->f_novirsum_alloc, fr->f_novirsum_nalloc);
-        }
-    }
-    else
-    {
-        fr->f_novirsum_n = 0;
+        /* TODO: remove this + 1 when padding is properly implemented */
+        fr->forceBufferNoVirialSummation->resize(natoms_f_novirsum + 1);
      }
  }
  
@@ -1544,7 +1538,7 @@ gmx_bool can_use_allvsall(const t_inputrec *ir, gmx_bool bPrintNote, t_commrec *
               (ir->implicit_solvent == eisGBSA && (ir->gb_algorithm == egbSTILL ||
                                                    ir->gb_algorithm == egbHCT   ||
                                                    ir->gb_algorithm == egbOBC))) &&
-            getenv("GMX_NO_ALLVSALL") == NULL
+            getenv("GMX_NO_ALLVSALL") == nullptr
          );
  
      if (bAllvsAll && ir->opts.ngener > 1)
@@ -1553,11 +1547,7 @@ gmx_bool can_use_allvsall(const t_inputrec *ir, gmx_bool bPrintNote, t_commrec *
  
          if (bPrintNote)
          {
-            if (MASTER(cr))
-            {
-                fprintf(stderr, "\n%s\n", note);
-            }
-            if (fp != NULL)
+            if (fp != nullptr)
              {
                  fprintf(fp, "\n%s\n", note);
              }
@@ -1574,38 +1564,15 @@ gmx_bool can_use_allvsall(const t_inputrec *ir, gmx_bool bPrintNote, t_commrec *
  }
  
  
-gmx_bool nbnxn_gpu_acceleration_supported(FILE             *fplog,
-                                          const t_commrec  *cr,
-                                          const t_inputrec *ir,
-                                          gmx_bool          bRerunMD)
-{
-    if (bRerunMD && ir->opts.ngener > 1)
-    {
-        /* Rerun execution time is dominated by I/O and pair search,
-         * so GPUs are not very useful, plus they do not support more
-         * than one energy group. If the user requested GPUs
-         * explicitly, a fatal error is given later.  With non-reruns,
-         * we fall back to a single whole-of system energy group
-         * (which runs much faster than a multiple-energy-groups
-         * implementation would), and issue a note in the .log
-         * file. Users can re-run if they want the information. */
-        md_print_warn(cr, fplog, "Rerun with energy groups is not implemented for GPUs, falling back to the CPU\n");
-        return FALSE;
-    }
-
-    return TRUE;
-}
-
-gmx_bool nbnxn_simd_supported(FILE             *fplog,
-                              const t_commrec  *cr,
-                              const t_inputrec *ir)
+gmx_bool nbnxn_simd_supported(const gmx::MDLogger &mdlog,
+                              const t_inputrec    *ir)
  {
      if (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB)
      {
          /* LJ PME with LB combination rule does 7 mesh operations.
           * This so slow that we don't compile SIMD non-bonded kernels
           * for that. */
-        md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with SIMD kernels, falling back to plain C kernels\n");
+        GMX_LOG(mdlog.warning).asParagraph().appendText("LJ-PME with Lorentz-Berthelot is not supported with SIMD kernels, falling back to plain C kernels");
          return FALSE;
      }
  
@@ -1661,7 +1628,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
  #endif  /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */
  
  
-        if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
+        if (getenv("GMX_NBNXN_SIMD_4XN") != nullptr)
          {
  #ifdef GMX_NBNXN_SIMD_4XN
              *kernel_type = nbnxnk4xN_SIMD_4xN;
@@ -1669,7 +1636,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
              gmx_fatal(FARGS, "SIMD 4xN kernels requested, but GROMACS has been compiled without support for these kernels");
  #endif
          }
-        if (getenv("GMX_NBNXN_SIMD_2XNN") != NULL)
+        if (getenv("GMX_NBNXN_SIMD_2XNN") != nullptr)
          {
  #ifdef GMX_NBNXN_SIMD_2XNN
              *kernel_type = nbnxnk4xN_SIMD_2xNN;
@@ -1685,16 +1652,17 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
           * With FMA analytical is sometimes faster for a width if 4 as well.
           * On BlueGene/Q, this is faster regardless of precision.
           * In single precision, this is faster on Bulldozer.
+         * On Skylake table is faster in single and double. TODO: Test 5xxx series.
           */
-#if GMX_SIMD_REAL_WIDTH >= 8 || \
-        (GMX_SIMD_REAL_WIDTH >= 4 && GMX_SIMD_HAVE_FMA && !GMX_DOUBLE) || GMX_SIMD_IBM_QPX
+#if ((GMX_SIMD_REAL_WIDTH >= 8 || (GMX_SIMD_REAL_WIDTH >= 4 && GMX_SIMD_HAVE_FMA && !GMX_DOUBLE)) \
+        && !GMX_SIMD_X86_AVX_512) || GMX_SIMD_IBM_QPX
          *ewald_excl = ewaldexclAnalytical;
  #endif
-        if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
+        if (getenv("GMX_NBNXN_EWALD_TABLE") != nullptr)
          {
              *ewald_excl = ewaldexclTable;
          }
-        if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != NULL)
+        if (getenv("GMX_NBNXN_EWALD_ANALYTICAL") != nullptr)
          {
              *ewald_excl = ewaldexclAnalytical;
          }
@@ -1706,7 +1674,7 @@ static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
  
  const char *lookup_nbnxn_kernel_name(int kernel_type)
  {
-    const char *returnvalue = NULL;
+    const char *returnvalue = nullptr;
      switch (kernel_type)
      {
          case nbnxnkNotSet:
@@ -1729,17 +1697,17 @@ const char *lookup_nbnxn_kernel_name(int kernel_type)
          case nbnxnkNR:
          default:
              gmx_fatal(FARGS, "Illegal kernel type selected");
-            returnvalue = NULL;
+            returnvalue = nullptr;
              break;
      }
      return returnvalue;
  };
  
  static void pick_nbnxn_kernel(FILE                *fp,
-                              const t_commrec     *cr,
+                              const gmx::MDLogger &mdlog,
                                gmx_bool             use_simd_kernels,
                                gmx_bool             bUseGPU,
-                              gmx_bool             bEmulateGPU,
+                              bool                 emulateGpu,
                                const t_inputrec    *ir,
                                int                 *kernel_type,
                                int                 *ewald_excl,
@@ -1750,13 +1718,13 @@ static void pick_nbnxn_kernel(FILE                *fp,
      *kernel_type = nbnxnkNotSet;
      *ewald_excl  = ewaldexclTable;
  
-    if (bEmulateGPU)
+    if (emulateGpu)
      {
          *kernel_type = nbnxnk8x8x8_PlainC;
  
          if (bDoNonbonded)
          {
-            md_print_warn(cr, fp, "Emulating a GPU run on the CPU (slow)");
+            GMX_LOG(mdlog.warning).asParagraph().appendText("Emulating a GPU run on the CPU (slow)");
          }
      }
      else if (bUseGPU)
@@ -1767,7 +1735,7 @@ static void pick_nbnxn_kernel(FILE                *fp,
      if (*kernel_type == nbnxnkNotSet)
      {
          if (use_simd_kernels &&
-            nbnxn_simd_supported(fp, cr, ir))
+            nbnxn_simd_supported(mdlog, ir))
          {
              pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl);
          }
@@ -1777,7 +1745,7 @@ static void pick_nbnxn_kernel(FILE                *fp,
          }
      }
  
-    if (bDoNonbonded && fp != NULL)
+    if (bDoNonbonded && fp != nullptr)
      {
          fprintf(fp, "\nUsing %s %dx%d non-bonded kernels\n\n",
                  lookup_nbnxn_kernel_name(*kernel_type),
@@ -1787,69 +1755,14 @@ static void pick_nbnxn_kernel(FILE                *fp,
          if (nbnxnk4x4_PlainC == *kernel_type ||
              nbnxnk8x8x8_PlainC == *kernel_type)
          {
-            md_print_warn(cr, fp,
-                          "WARNING: Using the slow %s kernels. This should\n"
-                          "not happen during routine usage on supported platforms.\n\n",
-                          lookup_nbnxn_kernel_name(*kernel_type));
+            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
+                    "WARNING: Using the slow %s kernels. This should\n"
+                    "not happen during routine usage on supported platforms.",
+                    lookup_nbnxn_kernel_name(*kernel_type));
          }
      }
  }
  
-static void pick_nbnxn_resources(FILE                *fp,
-                                 const t_commrec     *cr,
-                                 const gmx_hw_info_t *hwinfo,
-                                 gmx_bool             bDoNonbonded,
-                                 gmx_bool            *bUseGPU,
-                                 gmx_bool            *bEmulateGPU,
-                                 const gmx_gpu_opt_t *gpu_opt)
-{
-    gmx_bool bEmulateGPUEnvVarSet;
-    char     gpu_err_str[STRLEN];
-
-    *bUseGPU = FALSE;
-
-    bEmulateGPUEnvVarSet = (getenv("GMX_EMULATE_GPU") != NULL);
-
-    /* Run GPU emulation mode if GMX_EMULATE_GPU is defined. Because
-     * GPUs (currently) only handle non-bonded calculations, we will
-     * automatically switch to emulation if non-bonded calculations are
-     * turned off via GMX_NO_NONBONDED - this is the simple and elegant
-     * way to turn off GPU initialization, data movement, and cleanup.
-     *
-     * GPU emulation can be useful to assess the performance one can expect by
-     * adding GPU(s) to the machine. The conditional below allows this even
-     * if mdrun is compiled without GPU acceleration support.
-     * Note that you should freezing the system as otherwise it will explode.
-     */
-    *bEmulateGPU = (bEmulateGPUEnvVarSet ||
-                    (!bDoNonbonded && gpu_opt->n_dev_use > 0));
-
-    /* Enable GPU mode when GPUs are available or no GPU emulation is requested.
-     */
-    if (gpu_opt->n_dev_use > 0 && !(*bEmulateGPU))
-    {
-        /* Each PP node will use the intra-node id-th device from the
-         * list of detected/selected GPUs. */
-        if (!init_gpu(fp, cr->rank_pp_intranode, gpu_err_str,
-                      &hwinfo->gpu_info, gpu_opt))
-        {
-            /* At this point the init should never fail as we made sure that
-             * we have all the GPUs we need. If it still does, we'll bail. */
-            /* TODO the decorating of gpu_err_str is nicer if it
-               happens inside init_gpu. Out here, the decorating with
-               the MPI rank makes sense. */
-            gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
-                      cr->nodeid,
-                      get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
-                                        cr->rank_pp_intranode),
-                      gpu_err_str);
-        }
-
-        /* Here we actually turn on hardware GPU acceleration */
-        *bUseGPU = TRUE;
-    }
-}
-
  gmx_bool uses_simple_tables(int                 cutoff_scheme,
                              nonbonded_verlet_t *nbv,
                              int                 group)
@@ -1929,7 +1842,7 @@ void init_interaction_const_tables(FILE                *fp,
      {
          init_ewald_f_table(ic, rtab);
  
-        if (fp != NULL)
+        if (fp != nullptr)
          {
              fprintf(fp, "Initialized non-bonded Ewald correction tables, spacing: %.2e size: %d\n\n",
                      1/ic->tabq_scale, ic->tabq_size);
@@ -1999,8 +1912,6 @@ init_interaction_const(FILE                       *fp,
      snew_aligned(ic->tabq_coul_F, 16, 32);
      snew_aligned(ic->tabq_coul_V, 16, 32);
  
-    ic->rlist           = fr->rlist;
-
      /* Lennard-Jones */
      ic->vdwtype         = fr->vdwtype;
      ic->vdw_modifier    = fr->vdw_modifier;
@@ -2088,7 +1999,7 @@ init_interaction_const(FILE                       *fp,
          }
      }
  
-    if (fp != NULL)
+    if (fp != nullptr)
      {
          real dispersion_shift;
  
@@ -2114,55 +2025,68 @@ init_interaction_const(FILE                       *fp,
      *interaction_const = ic;
  }
  
+/* TODO deviceInfo should be logically const, but currently
+ * init_gpu modifies it to set up NVML support. This could
+ * happen during the detection phase, and deviceInfo could
+ * the become const. */
  static void init_nb_verlet(FILE                *fp,
+                           const gmx::MDLogger &mdlog,
                             nonbonded_verlet_t **nb_verlet,
                             gmx_bool             bFEP_NonBonded,
                             const t_inputrec    *ir,
                             const t_forcerec    *fr,
                             const t_commrec     *cr,
-                           const char          *nbpu_opt)
+                           const char          *nbpu_opt,
+                           gmx_device_info_t   *deviceInfo,
+                           const gmx_mtop_t    *mtop,
+                           matrix               box)
  {
      nonbonded_verlet_t *nbv;
      int                 i;
      char               *env;
-    gmx_bool            bEmulateGPU, bHybridGPURun = FALSE;
+    gmx_bool            bHybridGPURun = FALSE;
  
      nbnxn_alloc_t      *nb_alloc;
      nbnxn_free_t       *nb_free;
  
-    snew(nbv, 1);
+    nbv = new nonbonded_verlet_t();
  
-    pick_nbnxn_resources(fp, cr, fr->hwinfo,
-                         fr->bNonbonded,
-                         &nbv->bUseGPU,
-                         &bEmulateGPU,
-                         fr->gpu_opt);
+    nbv->emulateGpu = (getenv("GMX_EMULATE_GPU") != nullptr);
+    nbv->bUseGPU    = deviceInfo != nullptr;
  
-    nbv->nbs             = NULL;
+    GMX_RELEASE_ASSERT(!(nbv->emulateGpu && nbv->bUseGPU), "When GPU emulation is active, there cannot be a GPU assignment");
+
+    if (nbv->bUseGPU)
+    {
+        /* Use the assigned GPU. */
+        init_gpu(mdlog, cr->nodeid, deviceInfo);
+    }
+
+    nbv->nbs             = nullptr;
      nbv->min_ci_balanced = 0;
  
      nbv->ngrp = (DOMAINDECOMP(cr) ? 2 : 1);
      for (i = 0; i < nbv->ngrp; i++)
      {
          nbv->grp[i].nbl_lists.nnbl = 0;
-        nbv->grp[i].nbat           = NULL;
+        nbv->grp[i].nbat           = nullptr;
          nbv->grp[i].kernel_type    = nbnxnkNotSet;
  
          if (i == 0) /* local */
          {
-            pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
-                              nbv->bUseGPU, bEmulateGPU, ir,
+            pick_nbnxn_kernel(fp, mdlog, fr->use_simd_kernels,
+                              nbv->bUseGPU, nbv->emulateGpu, ir,
                                &nbv->grp[i].kernel_type,
                                &nbv->grp[i].ewald_excl,
                                fr->bNonbonded);
          }
          else /* non-local */
          {
-            if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
+            if (nbpu_opt != nullptr && strcmp(nbpu_opt, "gpu_cpu") == 0)
              {
                  /* Use GPU for local, select a CPU kernel for non-local */
-                pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
-                                  FALSE, FALSE, ir,
+                pick_nbnxn_kernel(fp, mdlog, fr->use_simd_kernels,
+                                  FALSE, false, ir,
                                    &nbv->grp[i].kernel_type,
                                    &nbv->grp[i].ewald_excl,
                                    fr->bNonbonded);
@@ -2178,9 +2102,13 @@ static void init_nb_verlet(FILE                *fp,
          }
      }
  
+    nbv->listParams = std::unique_ptr<NbnxnListParameters>(new NbnxnListParameters(ir->rlist));
+    setupDynamicPairlistPruning(fp, ir, mtop, box, nbv->bUseGPU, fr->ic,
+                                nbv->listParams.get());
+
      nbnxn_init_search(&nbv->nbs,
-                      DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
-                      DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
+                      DOMAINDECOMP(cr) ? &cr->dd->nc : nullptr,
+                      DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : nullptr,
                        bFEP_NonBonded,
                        gmx_omp_nthreads_get(emntPairsearch));
  
@@ -2206,7 +2134,7 @@ static void init_nb_verlet(FILE                *fp,
              if (fr->vdwtype == evdwCUT &&
                  (fr->vdw_modifier == eintmodNONE ||
                   fr->vdw_modifier == eintmodPOTSHIFT) &&
-                getenv("GMX_NO_LJ_COMB_RULE") == NULL)
+                getenv("GMX_NO_LJ_COMB_RULE") == nullptr)
              {
                  /* Plain LJ cut-off: we can optimize with combination rules */
                  enbnxninitcombrule = enbnxninitcombruleDETECT;
@@ -2251,11 +2179,10 @@ static void init_nb_verlet(FILE                *fp,
          /* init the NxN GPU data; the last argument tells whether we'll have
           * both local and non-local NB calculation on GPU */
          nbnxn_gpu_init(&nbv->gpu_nbv,
-                       &fr->hwinfo->gpu_info,
-                       fr->gpu_opt,
+                       deviceInfo,
                         fr->ic,
+                       nbv->listParams.get(),
                         nbv->grp,
-                       cr->rank_pp_intranode,
                         cr->nodeid,
                         (nbv->ngrp > 1) && !bHybridGPURun);
  
@@ -2278,7 +2205,7 @@ static void init_nb_verlet(FILE                *fp,
          }
  #endif  /* GMX_THREAD_MPI */
  
-        if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
+        if ((env = getenv("GMX_NB_MIN_CI")) != nullptr)
          {
              char *end;
  
@@ -2311,22 +2238,24 @@ static void init_nb_verlet(FILE                *fp,
  
  gmx_bool usingGpu(nonbonded_verlet_t *nbv)
  {
-    return nbv != NULL && nbv->bUseGPU;
+    return nbv != nullptr && nbv->bUseGPU;
  }
  
-void init_forcerec(FILE              *fp,
-                   t_forcerec        *fr,
-                   t_fcdata          *fcd,
-                   const t_inputrec  *ir,
-                   const gmx_mtop_t  *mtop,
-                   const t_commrec   *cr,
-                   matrix             box,
-                   const char        *tabfn,
-                   const char        *tabpfn,
-                   const t_filenm    *tabbfnm,
-                   const char        *nbpu_opt,
-                   gmx_bool           bNoSolvOpt,
-                   real               print_force)
+void init_forcerec(FILE                *fp,
+                   const gmx::MDLogger &mdlog,
+                   t_forcerec          *fr,
+                   t_fcdata            *fcd,
+                   const t_inputrec    *ir,
+                   const gmx_mtop_t    *mtop,
+                   const t_commrec     *cr,
+                   matrix               box,
+                   const char          *tabfn,
+                   const char          *tabpfn,
+                   const t_filenm      *tabbfnm,
+                   const char          *nbpu_opt,
+                   gmx_device_info_t   *deviceInfo,
+                   gmx_bool             bNoSolvOpt,
+                   real                 print_force)
  {
      int            i, m, negp_pp, negptable, egi, egj;
      real           rtab;
@@ -2338,15 +2267,6 @@ void init_forcerec(FILE              *fp,
      gmx_bool       bFEP_NonBonded;
      int           *nm_ind, egp_flags;
  
-    if (fr->hwinfo == NULL)
-    {
-        /* Detect hardware, gather information.
-         * In mdrun, hwinfo has already been set before calling init_forcerec.
-         * Here we ignore GPUs, as tools will not use them anyhow.
-         */
-        fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE);
-    }
-
      /* By default we turn SIMD kernels on, but it might be turned off further down... */
      fr->use_simd_kernels = TRUE;
  
@@ -2421,7 +2341,7 @@ void init_forcerec(FILE              *fp,
      fr->sc_sigma6_def = gmx::power6(ir->fepvals->sc_sigma);
  
      env = getenv("GMX_SCSIGMA_MIN");
-    if (env != NULL)
+    if (env != nullptr)
      {
          dbl = 0;
          sscanf(env, "%20lf", &dbl);
@@ -2433,13 +2353,13 @@ void init_forcerec(FILE              *fp,
      }
  
      fr->bNonbonded = TRUE;
-    if (getenv("GMX_NO_NONBONDED") != NULL)
+    if (getenv("GMX_NO_NONBONDED") != nullptr)
      {
          /* turn off non-bonded calculations */
          fr->bNonbonded = FALSE;
-        md_print_warn(cr, fp,
-                      "Found environment variable GMX_NO_NONBONDED.\n"
-                      "Disabling nonbonded calculations.\n");
+        GMX_LOG(mdlog.warning).asParagraph().appendText(
+                "Found environment variable GMX_NO_NONBONDED.\n"
+                "Disabling nonbonded calculations.");
      }
  
      bGenericKernelOnly = FALSE;
@@ -2448,9 +2368,9 @@ void init_forcerec(FILE              *fp,
       * can be used with water optimization, and disable it if that is not the case.
       */
  
-    if (getenv("GMX_NB_GENERIC") != NULL)
+    if (getenv("GMX_NB_GENERIC") != nullptr)
      {
-        if (fp != NULL)
+        if (fp != nullptr)
          {
              fprintf(fp,
                      "Found environment variable GMX_NB_GENERIC.\n"
@@ -2465,10 +2385,10 @@ void init_forcerec(FILE              *fp,
          bNoSolvOpt         = TRUE;
      }
  
-    if ( (getenv("GMX_DISABLE_SIMD_KERNELS") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
+    if ( (getenv("GMX_DISABLE_SIMD_KERNELS") != nullptr) || (getenv("GMX_NOOPTIMIZEDKERNELS") != nullptr) )
      {
          fr->use_simd_kernels = FALSE;
-        if (fp != NULL)
+        if (fp != nullptr)
          {
              fprintf(fp,
                      "\nFound environment variable GMX_DISABLE_SIMD_KERNELS.\n"
@@ -2480,16 +2400,16 @@ void init_forcerec(FILE              *fp,
      fr->bBHAM = (mtop->ffparams.functype[0] == F_BHAM);
  
      /* Check if we can/should do all-vs-all kernels */
-    fr->bAllvsAll       = can_use_allvsall(ir, FALSE, NULL, NULL);
-    fr->AllvsAll_work   = NULL;
-    fr->AllvsAll_workgb = NULL;
+    fr->bAllvsAll       = can_use_allvsall(ir, FALSE, nullptr, nullptr);
+    fr->AllvsAll_work   = nullptr;
+    fr->AllvsAll_workgb = nullptr;
  
      /* All-vs-all kernels have not been implemented in 4.6 and later.
       * See Redmine #1249. */
      if (fr->bAllvsAll)
      {
          fr->bAllvsAll            = FALSE;
-        if (fp != NULL)
+        if (fp != nullptr)
          {
              fprintf(fp,
                      "\nYour simulation settings would have triggered the efficient all-vs-all\n"
@@ -2513,7 +2433,7 @@ void init_forcerec(FILE              *fp,
          {
              fprintf(stderr, "\n%s\n", note);
          }
-        if (fp != NULL)
+        if (fp != nullptr)
          {
              fprintf(fp, "\n%s\n", note);
          }
@@ -2551,22 +2471,27 @@ void init_forcerec(FILE              *fp,
              }
              else
              {
-                fr->bMolPBC = TRUE;
+                /* Not making molecules whole is faster in most cases,
+                 * but With orientation restraints we need whole molecules.
+                 */
+                fr->bMolPBC = (fcd->orires.nr == 0);
  
-                if (getenv("GMX_USE_GRAPH") != NULL)
+                if (getenv("GMX_USE_GRAPH") != nullptr)
                  {
                      fr->bMolPBC = FALSE;
                      if (fp)
                      {
-                        md_print_warn(cr, fp, "GMX_USE_GRAPH is set, using the graph for bonded interactions\n");
+                        GMX_LOG(mdlog.warning).asParagraph().appendText("GMX_USE_GRAPH is set, using the graph for bonded interactions");
                      }
  
                      if (mtop->bIntermolecularInteractions)
                      {
-                        md_print_warn(cr, fp, "WARNING: Molecules linked by intermolecular interactions have to reside in the same periodic image, otherwise artifacts will occur!\n");
+                        GMX_LOG(mdlog.warning).asParagraph().appendText("WARNING: Molecules linked by intermolecular interactions have to reside in the same periodic image, otherwise artifacts will occur!");
                      }
                  }
  
+                GMX_RELEASE_ASSERT(fr->bMolPBC || !mtop->bIntermolecularInteractions, "We need to use PBC within molecules with inter-molecular interactions");
+
                  if (bSHAKE && fr->bMolPBC)
                  {
                      gmx_fatal(FARGS, "SHAKE is not properly supported with intermolecular interactions. For short simulations where linked molecules remain in the same periodic image, the environment variable GMX_USE_GRAPH can be used to override this check.\n");
@@ -2754,6 +2679,9 @@ void init_forcerec(FILE              *fp,
          fr->bcoultab = FALSE;
      }
  
+    /* This now calculates sum for q and C6 */
+    set_chargesum(fp, fr, mtop);
+
      /* Tables are used for direct ewald sum */
      if (fr->bEwald)
      {
@@ -2775,11 +2703,18 @@ void init_forcerec(FILE              *fp,
  
              if (ir->ewald_geometry == eewg3DC)
              {
+                bool haveNetCharge = (fabs(fr->qsum[0]) > 1e-4 ||
+                                      fabs(fr->qsum[1]) > 1e-4);
                  if (fp)
                  {
-                    fprintf(fp, "Using the Ewald3DC correction for systems with a slab geometry.\n");
+                    fprintf(fp, "Using the Ewald3DC correction for systems with a slab geometry%s.\n",
+                            haveNetCharge ? " and net charge" : "");
                  }
                  please_cite(fp, "In-Chul99a");
+                if (haveNetCharge)
+                {
+                    please_cite(fp, "Ballenegger2009");
+                }
              }
          }
          fr->ewaldcoeff_q = calc_ewaldcoeff_q(ir->rcoulomb, ir->ewald_rtol);
@@ -2821,10 +2756,14 @@ void init_forcerec(FILE              *fp,
      }
  
      fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype) ||
+                       fr->forceProviders->hasForcesWithoutVirialContribution() ||
                         gmx_mtop_ftype_count(mtop, F_POSRES) > 0 ||
-                       gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 ||
-                       inputrecElecField(ir)
-                       );
+                       gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0);
+
+    if (fr->bF_NoVirSum)
+    {
+        fr->forceBufferNoVirialSummation = new PaddedRVecVector;
+    }
  
      if (fr->cutoff_scheme == ecutsGROUP &&
          ncg_mtop(mtop) > fr->cg_nalloc && !DOMAINDECOMP(cr))
@@ -2833,17 +2772,17 @@ void init_forcerec(FILE              *fp,
          fr->cg_nalloc = ncg_mtop(mtop);
          srenew(fr->cg_cm, fr->cg_nalloc);
      }
-    if (fr->shift_vec == NULL)
+    if (fr->shift_vec == nullptr)
      {
          snew(fr->shift_vec, SHIFTS);
      }
  
-    if (fr->fshift == NULL)
+    if (fr->fshift == nullptr)
      {
          snew(fr->fshift, SHIFTS);
      }
  
-    if (fr->nbfp == NULL)
+    if (fr->nbfp == nullptr)
      {
          fr->ntype = mtop->ffparams.atnr;
          fr->nbfp  = mk_nbfp(&mtop->ffparams, fr->bBHAM);
@@ -2980,9 +2919,6 @@ void init_forcerec(FILE              *fp,
                     &fr->kappa, &fr->k_rf, &fr->c_rf);
      }
  
-    /*This now calculates sum for q and c6*/
-    set_chargesum(fp, fr, mtop);
-
      /* Construct tables for the group scheme. A little unnecessary to
       * make both vdw and coul tables sometimes, but what the
       * heck. Note that both cutoff schemes construct Ewald tables in
@@ -3045,7 +2981,7 @@ void init_forcerec(FILE              *fp,
          /* make tables for ordinary interactions */
          if (bSomeNormalNbListsAreInUse)
          {
-            make_nbf_tables(fp, fr, rtab, tabfn, NULL, NULL, &fr->nblists[0]);
+            make_nbf_tables(fp, fr, rtab, tabfn, nullptr, nullptr, &fr->nblists[0]);
              m = 1;
          }
          else
@@ -3151,7 +3087,7 @@ void init_forcerec(FILE              *fp,
                                     &fr->bExcl_IntraCGAll_InterCGNone);
      if (DOMAINDECOMP(cr))
      {
-        fr->cginfo = NULL;
+        fr->cginfo = nullptr;
      }
      else
      {
@@ -3205,7 +3141,9 @@ void init_forcerec(FILE              *fp,
              GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw, "With Verlet lists and no PME rcoulomb and rvdw should be identical");
          }
  
-        init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
+        init_nb_verlet(fp, mdlog, &fr->nbv, bFEP_NonBonded, ir, fr,
+                       cr, nbpu_opt, deviceInfo,
+                       mtop, box);
      }
  
      if (ir->eDispCorr != edispcNO)
@@ -3244,10 +3182,9 @@ void pr_forcerec(FILE *fp, t_forcerec *fr)
   * in this run because the PME ranks have no knowledge of whether GPUs
   * are used or not, but all ranks need to enter the barrier below.
   */
-void free_gpu_resources(const t_forcerec     *fr,
-                        const t_commrec      *cr,
-                        const gmx_gpu_info_t *gpu_info,
-                        const gmx_gpu_opt_t  *gpu_opt)
+void free_gpu_resources(const t_forcerec        *fr,
+                        const t_commrec         *cr,
+                        const gmx_device_info_t *deviceInfo)
  {
      gmx_bool bIsPPrankUsingGPU;
      char     gpu_err_str[STRLEN];
@@ -3260,26 +3197,29 @@ void free_gpu_resources(const t_forcerec     *fr,
          nbnxn_gpu_free(fr->nbv->gpu_nbv);
          /* stop the GPU profiler (only CUDA) */
          stopGpuProfiler();
+    }
  
-        /* With tMPI we need to wait for all ranks to finish deallocation before
-         * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing
-         * GPU and context.
-         *
-         * This is not a concern in OpenCL where we use one context per rank which
-         * is freed in nbnxn_gpu_free().
-         *
-         * Note: as only PP ranks need to free GPU resources, so it is safe to
-         * not call the barrier on PME ranks.
-         */
+    /* With tMPI we need to wait for all ranks to finish deallocation before
+     * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing
+     * GPU and context.
+     *
+     * This is not a concern in OpenCL where we use one context per rank which
+     * is freed in nbnxn_gpu_free().
+     *
+     * Note: it is safe to not call the barrier on the ranks which do not use GPU,
+     * but it is easier and more futureproof to call it on the whole node.
+     */
  #if GMX_THREAD_MPI
-        if (PAR(cr))
-        {
-            gmx_barrier(cr);
-        }
+    if (PAR(cr) || MULTISIM(cr))
+    {
+        gmx_barrier_physical_node(cr);
+    }
  #endif  /* GMX_THREAD_MPI */
  
+    if (bIsPPrankUsingGPU)
+    {
          /* uninitialize GPU (by destroying the context) */
-        if (!free_cuda_gpu(cr->rank_pp_intranode, gpu_err_str, gpu_info, gpu_opt))
+        if (!free_cuda_gpu(deviceInfo, gpu_err_str))
          {
              gmx_warning("On rank %d failed to free GPU #%d: %s",
                          cr->nodeid, get_current_cuda_gpu_device_id(), gpu_err_str);