Merge release-5-0 into master

[alexxy/gromacs.git] / src / programs / mdrun / pme_loadbal.c
diff --git a/src/programs/mdrun/pme_loadbal.c b/src/programs/mdrun/pme_loadbal.c

index 8a0cf21554c098405607cf961dd1ecf606f7f033..dd2cd1a2e9f049b01ff6de90078d249f925c0cf7 100644 (file)
--- a/src/programs/mdrun/pme_loadbal.c
+++ b/src/programs/mdrun/pme_loadbal.c
@@ -1,53 +1,58 @@
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
   *
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
   *
- *                This source code is part of
- *
- *                 G   R   O   M   A   C   S
- *
- *          GROningen MAchine for Chemical Simulations
- *
- *                        VERSION 4.6.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
- * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2011, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
   * of the License, or (at your option) any later version.
   *
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
   *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
   *
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
   *
- * And Hey:
- * Gallium Rubidium Oxygen Manganese Argon Carbon Silicon
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
   */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
-
-#include "smalloc.h"
-#include "network.h"
-#include "calcgrid.h"
-#include "pme.h"
-#include "vec.h"
-#include "domdec.h"
-#include "nbnxn_cuda_data_mgmt.h"
-#include "force.h"
-#include "macros.h"
+#include "gmxpre.h"
+
  #include "pme_loadbal.h"
  
+#include "config.h"
+
+#include "gromacs/legacyheaders/calcgrid.h"
+#include "gromacs/legacyheaders/domdec.h"
+#include "gromacs/legacyheaders/force.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/md_logging.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/pme.h"
+#include "gromacs/legacyheaders/sim_util.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/smalloc.h"
+
  /* Parameters and setting for one PP-PME setup */
  typedef struct {
      real      rcut_coulomb;    /* Coulomb cut-off                              */
@@ -57,9 +62,9 @@ typedef struct {
      real      spacing;         /* (largest) PME grid spacing                   */
      ivec      grid;            /* the PME grid dimensions                      */
      real      grid_efficiency; /* ineffiency factor for non-uniform grids <= 1 */
-    real      ewaldcoeff;      /* the Ewald coefficient                        */
+    real      ewaldcoeff_q;    /* Electrostatic Ewald coefficient            */
+    real      ewaldcoeff_lj;   /* LJ Ewald coefficient, only for the call to send_switchgrid */
      gmx_pme_t pmedata;         /* the data structure used in the PME code      */
-
      int       count;           /* number of times this setup has been timed    */
      double    cycles;          /* the fastest time for this setup in cycles    */
  } pme_setup_t;
@@ -78,11 +83,11 @@ typedef struct {
  #define PME_LB_ACCEL_TOL 1.02
  
  enum {
-    epmelblimNO, epmelblimBOX, epmelblimDD, epmelblimNR
+    epmelblimNO, epmelblimBOX, epmelblimDD, epmelblimPMEGRID, epmelblimNR
  };
  
  const char *pmelblim_str[epmelblimNR] =
-{ "no", "box size", "domain decompostion" };
+{ "no", "box size", "domain decompostion", "PME grid restriction" };
  
  struct pme_load_balancing {
      int          nstage;             /* the current maximum number of stages */
@@ -156,19 +161,20 @@ void pme_loadbal_init(pme_load_balancing_t *pme_lb_p,
      pme_lb->n = 1;
      snew(pme_lb->setup, pme_lb->n);
  
-    pme_lb->rcut_vdw              = ic->rvdw;
-    pme_lb->rcut_coulomb_start    = ir->rcoulomb;
-    pme_lb->nstcalclr_start       = ir->nstcalclr;
-
-    pme_lb->cur                   = 0;
-    pme_lb->setup[0].rcut_coulomb = ic->rcoulomb;
-    pme_lb->setup[0].rlist        = ic->rlist;
-    pme_lb->setup[0].rlistlong    = ic->rlistlong;
-    pme_lb->setup[0].nstcalclr    = ir->nstcalclr;
-    pme_lb->setup[0].grid[XX]     = ir->nkx;
-    pme_lb->setup[0].grid[YY]     = ir->nky;
-    pme_lb->setup[0].grid[ZZ]     = ir->nkz;
-    pme_lb->setup[0].ewaldcoeff   = ic->ewaldcoeff;
+    pme_lb->rcut_vdw                 = ic->rvdw;
+    pme_lb->rcut_coulomb_start       = ir->rcoulomb;
+    pme_lb->nstcalclr_start          = ir->nstcalclr;
+
+    pme_lb->cur                      = 0;
+    pme_lb->setup[0].rcut_coulomb    = ic->rcoulomb;
+    pme_lb->setup[0].rlist           = ic->rlist;
+    pme_lb->setup[0].rlistlong       = ic->rlistlong;
+    pme_lb->setup[0].nstcalclr       = ir->nstcalclr;
+    pme_lb->setup[0].grid[XX]        = ir->nkx;
+    pme_lb->setup[0].grid[YY]        = ir->nky;
+    pme_lb->setup[0].grid[ZZ]        = ir->nkz;
+    pme_lb->setup[0].ewaldcoeff_q    = ic->ewaldcoeff_q;
+    pme_lb->setup[0].ewaldcoeff_lj   = ic->ewaldcoeff_lj;
  
      pme_lb->setup[0].pmedata  = pmedata;
  
@@ -202,13 +208,16 @@ void pme_loadbal_init(pme_load_balancing_t *pme_lb_p,
      *pme_lb_p = pme_lb;
  }
  
-static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb,
-                                            int                  pme_order)
+static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t  pme_lb,
+                                            int                   pme_order,
+                                            const gmx_domdec_t   *dd)
  {
      pme_setup_t *set;
+    int          npmenodes_x, npmenodes_y;
      real         fac, sp;
      real         tmpr_coulomb, tmpr_vdw;
      int          d;
+    gmx_bool     grid_ok;
  
      /* Try to add a new setup with next larger cut-off to the list */
      pme_lb->n++;
@@ -216,9 +225,23 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb,
      set          = &pme_lb->setup[pme_lb->n-1];
      set->pmedata = NULL;
  
+    get_pme_nnodes(dd, &npmenodes_x, &npmenodes_y);
+
      fac = 1;
      do
      {
+        /* Avoid infinite while loop, which can occur at the minimum grid size.
+         * Note that in practice load balancing will stop before this point.
+         * The factor 2.1 allows for the extreme case in which only grids
+         * of powers of 2 are allowed (the current code supports more grids).
+         */
+        if (fac > 2.1)
+        {
+            pme_lb->n--;
+
+            return FALSE;
+        }
+
          fac *= 1.01;
          clear_ivec(set->grid);
          sp = calc_grid(NULL, pme_lb->box_start,
@@ -227,22 +250,30 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb,
                         &set->grid[YY],
                         &set->grid[ZZ]);
  
-        /* In parallel we can't have grids smaller than 2*pme_order,
-         * and we would anyhow not gain much speed at these grid sizes.
+        /* As here we can't easily check if one of the PME nodes
+         * uses threading, we do a conservative grid check.
+         * This means we can't use pme_order or less grid lines
+         * per PME node along x, which is not a strong restriction.
           */
-        for (d = 0; d < DIM; d++)
-        {
-            if (set->grid[d] <= 2*pme_order)
-            {
-                pme_lb->n--;
-
-                return FALSE;
-            }
-        }
+        gmx_pme_check_restrictions(pme_order,
+                                   set->grid[XX], set->grid[YY], set->grid[ZZ],
+                                   npmenodes_x, npmenodes_y,
+                                   TRUE,
+                                   FALSE,
+                                   &grid_ok);
      }
-    while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing);
+    while (sp <= 1.001*pme_lb->setup[pme_lb->cur].spacing || !grid_ok);
  
      set->rcut_coulomb = pme_lb->cut_spacing*sp;
+    if (set->rcut_coulomb < pme_lb->rcut_coulomb_start)
+    {
+        /* This is unlikely, but can happen when e.g. continuing from
+         * a checkpoint after equilibration where the box shrank a lot.
+         * We want to avoid rcoulomb getting smaller than rvdw
+         * and there might be more issues with decreasing rcoulomb.
+         */
+        set->rcut_coulomb = pme_lb->rcut_coulomb_start;
+    }
  
      if (pme_lb->cutoff_scheme == ecutsVERLET)
      {
@@ -298,8 +329,11 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t pme_lb,
          set->grid_efficiency *= (set->grid[d]*sp)/norm(pme_lb->box_start[d]);
      }
      /* The Ewald coefficient is inversly proportional to the cut-off */
-    set->ewaldcoeff =
-        pme_lb->setup[0].ewaldcoeff*pme_lb->setup[0].rcut_coulomb/set->rcut_coulomb;
+    set->ewaldcoeff_q =
+        pme_lb->setup[0].ewaldcoeff_q*pme_lb->setup[0].rcut_coulomb/set->rcut_coulomb;
+    /* We set ewaldcoeff_lj in set, even when LJ-PME is not used */
+    set->ewaldcoeff_lj =
+        pme_lb->setup[0].ewaldcoeff_lj*pme_lb->setup[0].rcut_coulomb/set->rcut_coulomb;
  
      set->count   = 0;
      set->cycles  = 0;
@@ -356,12 +390,12 @@ static int pme_loadbal_end(pme_load_balancing_t pme_lb)
  }
  
  static void print_loadbal_limited(FILE *fp_err, FILE *fp_log,
-                                  gmx_large_int_t step,
+                                  gmx_int64_t step,
                                    pme_load_balancing_t pme_lb)
  {
      char buf[STRLEN], sbuf[22];
  
-    sprintf(buf, "step %4s: the %s limited the PME load balancing to a coulomb cut-off of %.3f",
+    sprintf(buf, "step %4s: the %s limits the PME load balancing to a coulomb cut-off of %.3f",
              gmx_step_str(step, sbuf),
              pmelblim_str[pme_lb->elimited],
              pme_lb->setup[pme_loadbal_end(pme_lb)-1].rcut_coulomb);
@@ -406,17 +440,17 @@ static void switch_to_stage1(pme_load_balancing_t pme_lb)
      pme_lb->cur = pme_lb->start - 1;
  }
  
-gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
-                          t_commrec           *cr,
-                          FILE                *fp_err,
-                          FILE                *fp_log,
-                          t_inputrec          *ir,
-                          t_state             *state,
-                          double               cycles,
-                          interaction_const_t *ic,
-                          nonbonded_verlet_t  *nbv,
-                          gmx_pme_t           *pmedata,
-                          gmx_large_int_t      step)
+gmx_bool pme_load_balance(pme_load_balancing_t        pme_lb,
+                          t_commrec                  *cr,
+                          FILE                       *fp_err,
+                          FILE                       *fp_log,
+                          t_inputrec                 *ir,
+                          t_state                    *state,
+                          double                      cycles,
+                          interaction_const_t        *ic,
+                          struct nonbonded_verlet_t  *nbv,
+                          gmx_pme_t                  *pmedata,
+                          gmx_int64_t                 step)
  {
      gmx_bool     OK;
      pme_setup_t *set;
@@ -529,7 +563,12 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
              else
              {
                  /* Find the next setup */
-                OK = pme_loadbal_increase_cutoff(pme_lb, ir->pme_order);
+                OK = pme_loadbal_increase_cutoff(pme_lb, ir->pme_order, cr->dd);
+
+                if (!OK)
+                {
+                    pme_lb->elimited = epmelblimPMEGRID;
+                }
              }
  
              if (OK && ir->ePBC != epbcNONE)
@@ -635,29 +674,60 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
  
      set = &pme_lb->setup[pme_lb->cur];
  
-    ic->rcoulomb   = set->rcut_coulomb;
-    ic->rlist      = set->rlist;
-    ic->rlistlong  = set->rlistlong;
-    ir->nstcalclr  = set->nstcalclr;
-    ic->ewaldcoeff = set->ewaldcoeff;
-
-    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
-    if (pme_lb->cutoff_scheme == ecutsVERLET &&
-        nbv->grp[0].kernel_type == nbnxnk8x8x8_CUDA)
+    ic->rcoulomb     = set->rcut_coulomb;
+    ic->rlist        = set->rlist;
+    ic->rlistlong    = set->rlistlong;
+    ir->nstcalclr    = set->nstcalclr;
+    ic->ewaldcoeff_q = set->ewaldcoeff_q;
+    /* TODO: centralize the code that sets the potentials shifts */
+    if (ic->coulomb_modifier == eintmodPOTSHIFT)
      {
-        nbnxn_cuda_pme_loadbal_update_param(nbv->cu_nbv, ic);
+        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb);
      }
-    else
+    if (EVDW_PME(ic->vdwtype))
      {
-        init_interaction_const_tables(NULL, ic, bUsesSimpleTables,
-                                      rtab);
+        /* We have PME for both Coulomb and VdW, set rvdw equal to rcoulomb */
+        ic->rvdw            = set->rcut_coulomb;
+        ic->ewaldcoeff_lj   = set->ewaldcoeff_lj;
+        if (ic->vdw_modifier == eintmodPOTSHIFT)
+        {
+            real crc2;
+
+            ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0);
+            ic->repulsion_shift.cpot  = -pow(ic->rvdw, -12.0);
+            ic->sh_invrc6             = -ic->dispersion_shift.cpot;
+            crc2                      = sqr(ic->ewaldcoeff_lj*ic->rvdw);
+            ic->sh_lj_ewald           = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);
+        }
      }
  
-    if (pme_lb->cutoff_scheme == ecutsVERLET && nbv->ngrp > 1)
+    bUsesSimpleTables = uses_simple_tables(ir->cutoff_scheme, nbv, 0);
+    nbnxn_cuda_pme_loadbal_update_param(nbv, ic);
+
+    /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+     * also sharing texture references. To keep the code simple, we don't
+     * treat texture references as shared resources, but this means that
+     * the coulomb_tab texture ref will get updated by multiple threads.
+     * Hence, to ensure that the non-bonded kernels don't start before all
+     * texture binding operations are finished, we need to wait for all ranks
+     * to arrive here before continuing.
+     *
+     * Note that we could omit this barrier if GPUs are not shared (or
+     * texture objects are used), but as this is initialization code, there
+     * is not point in complicating things.
+     */
+#ifdef GMX_THREAD_MPI
+    if (PAR(cr) && use_GPU(nbv))
      {
-        init_interaction_const_tables(NULL, ic, bUsesSimpleTables,
-                                      rtab);
+        gmx_barrier(cr);
      }
+#endif  /* GMX_THREAD_MPI */
+
+    /* Usually we won't need the simple tables with GPUs.
+     * But we do with hybrid acceleration and with free energy.
+     * To avoid bugs, we always re-initialize the simple tables here.
+     */
+    init_interaction_const_tables(NULL, ic, bUsesSimpleTables, rtab);
  
      if (cr->duty & DUTY_PME)
      {
@@ -675,7 +745,7 @@ gmx_bool pme_load_balance(pme_load_balancing_t pme_lb,
      else
      {
          /* Tell our PME-only node to switch grid */
-        gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff);
+        gmx_pme_send_switchgrid(cr, set->grid, set->ewaldcoeff_q, set->ewaldcoeff_lj);
      }
  
      if (debug)
@@ -701,6 +771,22 @@ static int pme_grid_points(const pme_setup_t *setup)
      return setup->grid[XX]*setup->grid[YY]*setup->grid[ZZ];
  }
  
+static real pme_loadbal_rlist(const pme_setup_t *setup)
+{
+    /* With the group cut-off scheme we can have twin-range either
+     * for Coulomb or for VdW, so we need a check here.
+     * With the Verlet cut-off scheme rlist=rlistlong.
+     */
+    if (setup->rcut_coulomb > setup->rlist)
+    {
+        return setup->rlistlong;
+    }
+    else
+    {
+        return setup->rlist;
+    }
+}
+
  static void print_pme_loadbal_setting(FILE              *fplog,
                                        char              *name,
                                        const pme_setup_t *setup)
@@ -708,17 +794,19 @@ static void print_pme_loadbal_setting(FILE              *fplog,
      fprintf(fplog,
              "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n",
              name,
-            setup->rcut_coulomb, setup->rlist,
+            setup->rcut_coulomb, pme_loadbal_rlist(setup),
              setup->grid[XX], setup->grid[YY], setup->grid[ZZ],
-            setup->spacing, 1/setup->ewaldcoeff);
+            setup->spacing, 1/setup->ewaldcoeff_q);
  }
  
  static void print_pme_loadbal_settings(pme_load_balancing_t pme_lb,
-                                       FILE                *fplog)
+                                       t_commrec           *cr,
+                                       FILE                *fplog,
+                                       gmx_bool             bNonBondedOnGPU)
  {
      double pp_ratio, grid_ratio;
  
-    pp_ratio   = pow(pme_lb->setup[pme_lb->cur].rlist/pme_lb->setup[0].rlistlong, 3.0);
+    pp_ratio   = pow(pme_loadbal_rlist(&pme_lb->setup[pme_lb->cur])/pme_loadbal_rlist(&pme_lb->setup[0]), 3.0);
      grid_ratio = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
          (double)pme_grid_points(&pme_lb->setup[0]);
  
@@ -746,14 +834,27 @@ static void print_pme_loadbal_settings(pme_load_balancing_t pme_lb,
      fprintf(fplog, " cost-ratio           %4.2f             %4.2f\n",
              pp_ratio, grid_ratio);
      fprintf(fplog, " (note that these numbers concern only part of the total PP and PME load)\n");
-    fprintf(fplog, "\n");
+
+    if (pp_ratio > 1.5 && !bNonBondedOnGPU)
+    {
+        md_print_warn(cr, fplog,
+                      "NOTE: PME load balancing increased the non-bonded workload by more than 50%%.\n"
+                      "      For better performance, use (more) PME ranks (mdrun -npme),\n"
+                      "      or if you are beyond the scaling limit, use fewer total ranks (or nodes).\n");
+    }
+    else
+    {
+        fprintf(fplog, "\n");
+    }
  }
  
-void pme_loadbal_done(pme_load_balancing_t pme_lb, FILE *fplog)
+void pme_loadbal_done(pme_load_balancing_t pme_lb,
+                      t_commrec *cr, FILE *fplog,
+                      gmx_bool bNonBondedOnGPU)
  {
      if (fplog != NULL && (pme_lb->cur > 0 || pme_lb->elimited != epmelblimNO))
      {
-        print_pme_loadbal_settings(pme_lb, fplog);
+        print_pme_loadbal_settings(pme_lb, cr, fplog, bNonBondedOnGPU);
      }
  
      /* TODO: Here we should free all pointers in pme_lb,