Merge release-5-0 into master

[alexxy/gromacs.git] / src / programs / mdrun / pme_loadbal.c
diff --git a/src/programs/mdrun/pme_loadbal.c b/src/programs/mdrun/pme_loadbal.c

index 876c7881429e1983da00aabacba2b99c40f0d4c8..dd2cd1a2e9f049b01ff6de90078d249f925c0cf7 100644 (file)
--- a/src/programs/mdrun/pme_loadbal.c
+++ b/src/programs/mdrun/pme_loadbal.c
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -32,22 +32,27 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include "smalloc.h"
-#include "network.h"
-#include "calcgrid.h"
-#include "pme.h"
-#include "vec.h"
-#include "domdec.h"
-#include "nbnxn_cuda_data_mgmt.h"
-#include "force.h"
-#include "macros.h"
-#include "md_logging.h"
+#include "gmxpre.h"
+
  #include "pme_loadbal.h"
  
+#include "config.h"
+
+#include "gromacs/legacyheaders/calcgrid.h"
+#include "gromacs/legacyheaders/domdec.h"
+#include "gromacs/legacyheaders/force.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/md_logging.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/pme.h"
+#include "gromacs/legacyheaders/sim_util.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/smalloc.h"
+
  /* Parameters and setting for one PP-PME setup */
  typedef struct {
      real      rcut_coulomb;    /* Coulomb cut-off                              */
@@ -260,6 +265,15 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t  pme_lb,
      while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing || !grid_ok);
  
      set->rcut_coulomb = pme_lb->cut_spacing*sp;
+    if (set->rcut_coulomb < pme_lb->rcut_coulomb_start)
+    {
+        /* This is unlikely, but can happen when e.g. continuing from
+         * a checkpoint after equilibration where the box shrank a lot.
+         * We want to avoid rcoulomb getting smaller than rvdw
+         * and there might be more issues with decreasing rcoulomb.
+         */
+        set->rcut_coulomb = pme_lb->rcut_coulomb_start;
+    }
  
      if (pme_lb->cutoff_scheme == ecutsVERLET)
      {
@@ -317,6 +331,9 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t  pme_lb,
      /* The Ewald coefficient is inversly proportional to the cut-off */
      set->ewaldcoeff_q =
          pme_lb->setup[0].ewaldcoeff_q*pme_lb->setup[0].rcut_coulomb/set->rcut_coulomb;
+    /* We set ewaldcoeff_lj in set, even when LJ-PME is not used */
+    set->ewaldcoeff_lj =
+        pme_lb->setup[0].ewaldcoeff_lj*pme_lb->setup[0].rcut_coulomb/set->rcut_coulomb;
  
      set->count   = 0;
      set->cycles  = 0;
@@ -373,7 +390,7 @@ static int pme_loadbal_end(pme_load_balancing_t pme_lb)
  }
  
  static void print_loadbal_limited(FILE *fp_err, FILE *fp_log,
-                                  gmx_large_int_t step,
+                                  gmx_int64_t step,
                                    pme_load_balancing_t pme_lb)
  {
      char buf[STRLEN], sbuf[22];
@@ -423,17 +440,17 @@ static void switch_to_stage1(pme_load_balancing_t pme_lb)
      pme_lb->cur = pme_lb->start - 1;
  }
  
-gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
-                          t_commrec           *cr,
-                          FILE                *fp_err,
-                          FILE                *fp_log,
-                          t_inputrec          *ir,
-                          t_state             *state,
-                          double               cycles,
-                          interaction_const_t *ic,
-                          nonbonded_verlet_t  *nbv,
-                          gmx_pme_t           *pmedata,
-                          gmx_large_int_t      step)
+gmx_bool pme_load_balance(pme_load_balancing_t        pme_lb,
+                          t_commrec                  *cr,
+                          FILE                       *fp_err,
+                          FILE                       *fp_log,
+                          t_inputrec                 *ir,
+                          t_state                    *state,
+                          double                      cycles,
+                          interaction_const_t        *ic,
+                          struct nonbonded_verlet_t  *nbv,
+                          gmx_pme_t                  *pmedata,
+                          gmx_int64_t                 step)
  {
      gmx_bool     OK;
      pme_setup_t *set;
@@ -662,43 +679,55 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
      ic->rlistlong    = set->rlistlong;
      ir->nstcalclr    = set->nstcalclr;
      ic->ewaldcoeff_q = set->ewaldcoeff_q;
-
-    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
-    if (pme_lb->cutoff_scheme == ecutsVERLET &&
-        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
-    {
-        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
-
-        /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
-         * also sharing texture references. To keep the code simple, we don't
-         * treat texture references as shared resources, but this means that
-         * the coulomb_tab texture ref will get updated by multiple threads.
-         * Hence, to ensure that the non-bonded kernels don't start before all
-         * texture binding operations are finished, we need to wait for all ranks
-         * to arrive here before continuing.
-         *
-         * Note that we could omit this barrier if GPUs are not shared (or
-         * texture objects are used), but as this is initialization code, there
-         * is not point in complicating things.
-         */
-#ifdef GMX_THREAD_MPI
-        if (PAR(cr))
-        {
-            gmx_barrier(cr);
-        }
-#endif  /* GMX_THREAD_MPI */
+    /* TODO: centralize the code that sets the potentials shifts */
+    if (ic->coulomb_modifier == eintmodPOTSHIFT)
+    {
+        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb);
      }
-    else
+    if (EVDW_PME(ic->vdwtype))
      {
-        init_interaction_const_tables(NULL, ic, bUsesSimpleTables,
-                                      rtab);
+        /* We have PME for both Coulomb and VdW, set rvdw equal to rcoulomb */
+        ic->rvdw            = set->rcut_coulomb;
+        ic->ewaldcoeff_lj   = set->ewaldcoeff_lj;
+        if (ic->vdw_modifier == eintmodPOTSHIFT)
+        {
+            real crc2;
+
+            ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0);
+            ic->repulsion_shift.cpot  = -pow(ic->rvdw, -12.0);
+            ic->sh_invrc6             = -ic->dispersion_shift.cpot;
+            crc2                      = sqr(ic->ewaldcoeff_lj*ic->rvdw);
+            ic->sh_lj_ewald           = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);
+        }
      }
  
-    if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->ngrp > 1)
+    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
+    nbnxn_cuda_pme_loadbal_update_param(nbv, ic);
+
+    /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+     * also sharing texture references. To keep the code simple, we don't
+     * treat texture references as shared resources, but this means that
+     * the coulomb_tab texture ref will get updated by multiple threads.
+     * Hence, to ensure that the non-bonded kernels don't start before all
+     * texture binding operations are finished, we need to wait for all ranks
+     * to arrive here before continuing.
+     *
+     * Note that we could omit this barrier if GPUs are not shared (or
+     * texture objects are used), but as this is initialization code, there
+     * is not point in complicating things.
+     */
+#ifdef GMX_THREAD_MPI
+    if (PAR(cr) && use_GPU(nbv))
      {
-        init_interaction_const_tables(NULL, ic, bUsesSimpleTables,
-                                      rtab);
+        gmx_barrier(cr);
      }
+#endif  /* GMX_THREAD_MPI */
+
+    /* Usually we won't need the simple tables with GPUs.
+     * But we do with hybrid acceleration and with free energy.
+     * To avoid bugs, we always re-initialize the simple tables here.
+     */
+    init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab);
  
      if (cr->duty & DUTY_PME)
      {
@@ -810,8 +839,8 @@ static void print_pme_loadbal_settings(pme_load_balancing_t pme_lb,
      {
          md_print_warn(cr, fplog,
                        "NOTE: PME load balancing increased the non-bonded workload by more than 50%%.\n"
-                      "      For better performance use (more) PME nodes (mdrun -npme),\n"
-                      "      or in case you are beyond the scaling limit, use less nodes in total.\n");
+                      "      For better performance, use (more) PME ranks (mdrun -npme),\n"
+                      "      or if you are beyond the scaling limit, use fewer total ranks (or nodes).\n");
      }
      else
      {