Merge branch release-5-1 into release-2016

[alexxy/gromacs.git] / src / gromacs / ewald / pme-load-balancing.cpp
diff --git a/src/gromacs/ewald/pme-load-balancing.cpp b/src/gromacs/ewald/pme-load-balancing.cpp

index b7d45657886015fca3a3a80d0402c002e9f7970f..292624fb4d783b2c356701c8b72837d9e267be6d 100644 (file)
--- a/src/gromacs/ewald/pme-load-balancing.cpp
+++ b/src/gromacs/ewald/pme-load-balancing.cpp
@@ -55,17 +55,22 @@
  
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/domdec/domdec_network.h"
-#include "gromacs/legacyheaders/calcgrid.h"
-#include "gromacs/legacyheaders/force.h"
-#include "gromacs/legacyheaders/md_logging.h"
-#include "gromacs/legacyheaders/network.h"
-#include "gromacs/legacyheaders/sim_util.h"
-#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/fft/calcgrid.h"
+#include "gromacs/gmxlib/md_logging.h"
+#include "gromacs/gmxlib/network.h"
+#include "gromacs/math/functions.h"
  #include "gromacs/math/vec.h"
+#include "gromacs/mdlib/forcerec.h"
  #include "gromacs/mdlib/nbnxn_gpu_data_mgmt.h"
+#include "gromacs/mdlib/sim_util.h"
+#include "gromacs/mdtypes/commrec.h"
+#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/md_enums.h"
  #include "gromacs/pbcutil/pbc.h"
  #include "gromacs/timing/wallcycle.h"
  #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
  #include "gromacs/utility/smalloc.h"
  
@@ -75,8 +80,6 @@
  struct pme_setup_t {
      real              rcut_coulomb;    /**< Coulomb cut-off                              */
      real              rlist;           /**< pair-list cut-off                            */
-    real              rlistlong;       /**< LR pair-list cut-off                         */
-    int               nstcalclr;       /**< frequency of evaluating long-range forces for group scheme */
      real              spacing;         /**< (largest) PME grid spacing                   */
      ivec              grid;            /**< the PME grid dimensions                      */
      real              grid_efficiency; /**< ineffiency factor for non-uniform grids <= 1 */
@@ -124,7 +127,6 @@ struct pme_load_balancing_t {
      real         cut_spacing;        /**< the minimum cutoff / PME grid spacing ratio */
      real         rcut_vdw;           /**< Vdw cutoff (does not change) */
      real         rcut_coulomb_start; /**< Initial electrostatics cutoff */
-    int          nstcalclr_start;    /**< Initial electrostatics cutoff */
      real         rbuf_coulomb;       /**< the pairlist buffer size */
      real         rbuf_vdw;           /**< the pairlist buffer size */
      matrix       box_start;          /**< the initial simulation box */
@@ -167,6 +169,12 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
      real                  spm, sp;
      int                   d;
  
+    // Note that we don't (yet) support PME load balancing with LJ-PME only.
+    GMX_RELEASE_ASSERT(EEL_PME(ir->coulombtype), "pme_loadbal_init called without PME electrostatics");
+    // To avoid complexity, we require a single cut-off with PME for q+LJ.
+    // This is checked by grompp, but it doesn't hurt to check again.
+    GMX_RELEASE_ASSERT(!(EEL_PME(ir->coulombtype) && EVDW_PME(ir->vdwtype) && ir->rcoulomb != ir->rvdw), "With Coulomb and LJ PME, rcoulomb should be equal to rvdw");
+
      snew(pme_lb, 1);
  
      pme_lb->bSepPMERanks  = !(cr->duty & DUTY_PME);
@@ -179,30 +187,8 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
  
      pme_lb->cutoff_scheme = ir->cutoff_scheme;
  
-    if (pme_lb->cutoff_scheme == ecutsVERLET)
-    {
-        pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb;
-        pme_lb->rbuf_vdw     = pme_lb->rbuf_coulomb;
-    }
-    else
-    {
-        if (ic->rcoulomb > ic->rlist)
-        {
-            pme_lb->rbuf_coulomb = ic->rlistlong - ic->rcoulomb;
-        }
-        else
-        {
-            pme_lb->rbuf_coulomb = ic->rlist - ic->rcoulomb;
-        }
-        if (ic->rvdw > ic->rlist)
-        {
-            pme_lb->rbuf_vdw = ic->rlistlong - ic->rvdw;
-        }
-        else
-        {
-            pme_lb->rbuf_vdw = ic->rlist - ic->rvdw;
-        }
-    }
+    pme_lb->rbuf_coulomb  = ic->rlist - ic->rcoulomb;
+    pme_lb->rbuf_vdw      = ic->rlist - ic->rvdw;
  
      copy_mat(box, pme_lb->box_start);
      if (ir->ePBC == epbcXY && ir->nwall == 2)
@@ -215,13 +201,10 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
  
      pme_lb->rcut_vdw                 = ic->rvdw;
      pme_lb->rcut_coulomb_start       = ir->rcoulomb;
-    pme_lb->nstcalclr_start          = ir->nstcalclr;
  
      pme_lb->cur                      = 0;
      pme_lb->setup[0].rcut_coulomb    = ic->rcoulomb;
      pme_lb->setup[0].rlist           = ic->rlist;
-    pme_lb->setup[0].rlistlong       = ic->rlistlong;
-    pme_lb->setup[0].nstcalclr       = ir->nstcalclr;
      pme_lb->setup[0].grid[XX]        = ir->nkx;
      pme_lb->setup[0].grid[YY]        = ir->nky;
      pme_lb->setup[0].grid[ZZ]        = ir->nkz;
@@ -261,6 +244,11 @@ void pme_loadbal_init(pme_load_balancing_t     **pme_lb_p,
      pme_lb->cycles_n = 0;
      pme_lb->cycles_c = 0;
  
+    if (!wallcycle_have_counter())
+    {
+        md_print_warn(cr, fp_log, "NOTE: Cycle counters unsupported or not enabled in kernel. Cannot use PME-PP balancing.\n");
+    }
+
      /* Tune with GPUs and/or separate PME ranks.
       * When running only on a CPU without PME ranks, PME tuning will only help
       * with small numbers of atoms in the cut-off sphere.
@@ -367,9 +355,9 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb,
  
      if (pme_lb->cutoff_scheme == ecutsVERLET)
      {
-        set->rlist        = set->rcut_coulomb + pme_lb->rbuf_coulomb;
-        /* We dont use LR lists with Verlet, but this avoids if-statements in further checks */
-        set->rlistlong    = set->rlist;
+        /* Never decrease the Coulomb and VdW list buffers */
+        set->rlist        = std::max(set->rcut_coulomb + pme_lb->rbuf_coulomb,
+                                     pme_lb->rcut_vdw + pme_lb->rbuf_vdw);
      }
      else
      {
@@ -381,39 +369,6 @@ static gmx_bool pme_loadbal_increase_cutoff(pme_load_balancing_t *pme_lb,
           * - We should set fr->bTwinRange = (fr->rlistlong > fr->rlist).
           */
          set->rlist            = std::min(tmpr_coulomb, tmpr_vdw);
-        set->rlistlong        = std::max(tmpr_coulomb, tmpr_vdw);
-
-        /* Set the long-range update frequency */
-        if (set->rlist == set->rlistlong)
-        {
-            /* No long-range interactions if the short-/long-range cutoffs are identical */
-            set->nstcalclr = 0;
-        }
-        else if (pme_lb->nstcalclr_start == 0 || pme_lb->nstcalclr_start == 1)
-        {
-            /* We were not doing long-range before, but now we are since rlist!=rlistlong */
-            set->nstcalclr = 1;
-        }
-        else
-        {
-            /* We were already doing long-range interactions from the start */
-            if (pme_lb->rcut_vdw > pme_lb->rcut_coulomb_start)
-            {
-                /* We were originally doing long-range VdW-only interactions.
-                 * If rvdw is still longer than rcoulomb we keep the original nstcalclr,
-                 * but if the coulomb cutoff has become longer we should update the long-range
-                 * part every step.
-                 */
-                set->nstcalclr = (tmpr_vdw > tmpr_coulomb) ? pme_lb->nstcalclr_start : 1;
-            }
-            else
-            {
-                /* We were not doing any long-range interaction from the start,
-                 * since it is not possible to do twin-range coulomb for the PME interaction.
-                 */
-                set->nstcalclr = 1;
-            }
-        }
      }
  
      set->spacing      = sp;
@@ -465,6 +420,7 @@ static void print_grid(FILE *fp_err, FILE *fp_log,
      if (fp_err != NULL)
      {
          fprintf(fp_err, "\r%s\n", buf);
+        fflush(fp_err);
      }
      if (fp_log != NULL)
      {
@@ -500,6 +456,7 @@ static void print_loadbal_limited(FILE *fp_err, FILE *fp_log,
      if (fp_err != NULL)
      {
          fprintf(fp_err, "\r%s\n", buf);
+        fflush(fp_err);
      }
      if (fp_log != NULL)
      {
@@ -568,7 +525,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
                   t_commrec                 *cr,
                   FILE                      *fp_err,
                   FILE                      *fp_log,
-                 t_inputrec                *ir,
+                 const t_inputrec          *ir,
                   t_state                   *state,
                   double                     cycles,
                   interaction_const_t       *ic,
@@ -591,7 +548,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
      set = &pme_lb->setup[pme_lb->cur];
      set->count++;
  
-    rtab = ir->rlistlong + ir->tabext;
+    rtab = ir->rlist + ir->tabext;
  
      if (set->count % 2 == 1)
      {
@@ -649,7 +606,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
               * better overal performance can be obtained with a slightly
               * shorter cut-off and better DD load balancing.
               */
-            set_dd_dlb_max_cutoff(cr, pme_lb->setup[pme_lb->fastest].rlistlong);
+            set_dd_dlb_max_cutoff(cr, pme_lb->setup[pme_lb->fastest].rlist);
          }
      }
      cycles_fast = pme_lb->setup[pme_lb->fastest].cycles;
@@ -691,7 +648,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
  
              if (OK && ir->ePBC != epbcNONE)
              {
-                OK = (sqr(pme_lb->setup[pme_lb->cur+1].rlistlong)
+                OK = (gmx::square(pme_lb->setup[pme_lb->cur+1].rlist)
                        <= max_cutoff2(ir->ePBC, state->box));
                  if (!OK)
                  {
@@ -706,7 +663,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
                  if (DOMAINDECOMP(cr))
                  {
                      OK = change_dd_cutoff(cr, state, ir,
-                                          pme_lb->setup[pme_lb->cur].rlistlong);
+                                          pme_lb->setup[pme_lb->cur].rlist);
                      if (!OK)
                      {
                          /* Failed: do not use this setup */
@@ -778,7 +735,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
  
      if (DOMAINDECOMP(cr) && pme_lb->stage > 0)
      {
-        OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlistlong);
+        OK = change_dd_cutoff(cr, state, ir, pme_lb->setup[pme_lb->cur].rlist);
          if (!OK)
          {
              /* For some reason the chosen cut-off is incompatible with DD.
@@ -814,14 +771,12 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
  
      ic->rcoulomb     = set->rcut_coulomb;
      ic->rlist        = set->rlist;
-    ic->rlistlong    = set->rlistlong;
-    ir->nstcalclr    = set->nstcalclr;
      ic->ewaldcoeff_q = set->ewaldcoeff_q;
      /* TODO: centralize the code that sets the potentials shifts */
      if (ic->coulomb_modifier == eintmodPOTSHIFT)
      {
          GMX_RELEASE_ASSERT(ic->rcoulomb != 0, "Cutoff radius cannot be zero");
-        ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb) / ic->rcoulomb;
+        ic->sh_ewald = std::erfc(ic->ewaldcoeff_q*ic->rcoulomb) / ic->rcoulomb;
      }
      if (EVDW_PME(ic->vdwtype))
      {
@@ -832,11 +787,11 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
          {
              real       crc2;
  
-            ic->dispersion_shift.cpot = -std::pow(static_cast<double>(ic->rvdw), -6.0);
-            ic->repulsion_shift.cpot  = -std::pow(static_cast<double>(ic->rvdw), -12.0);
+            ic->dispersion_shift.cpot = -1.0/gmx::power6(static_cast<double>(ic->rvdw));
+            ic->repulsion_shift.cpot  = -1.0/gmx::power12(static_cast<double>(ic->rvdw));
              ic->sh_invrc6             = -ic->dispersion_shift.cpot;
-            crc2                      = sqr(ic->ewaldcoeff_lj*ic->rvdw);
-            ic->sh_lj_ewald           = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*std::pow(static_cast<double>(ic->rvdw), -6.0);
+            crc2                      = gmx::square(ic->ewaldcoeff_lj*ic->rvdw);
+            ic->sh_lj_ewald           = (std::exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)/gmx::power6(ic->rvdw);
          }
      }
  
@@ -857,7 +812,7 @@ pme_load_balance(pme_load_balancing_t      *pme_lb,
       * texture objects are used), but as this is initialization code, there
       * is not point in complicating things.
       */
-#ifdef GMX_THREAD_MPI
+#if GMX_THREAD_MPI
      if (PAR(cr) && use_GPU(nbv))
      {
          gmx_barrier(cr);
@@ -923,7 +878,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
                      t_commrec            *cr,
                      FILE                 *fp_err,
                      FILE                 *fp_log,
-                    t_inputrec           *ir,
+                    const t_inputrec     *ir,
                      t_forcerec           *fr,
                      t_state              *state,
                      gmx_wallcycle_t       wcycle,
@@ -1026,7 +981,7 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
               * This also ensures that we won't disable the currently
               * optimal setting during a second round of PME balancing.
               */
-            set_dd_dlb_max_cutoff(cr, fr->ic->rlistlong);
+            set_dd_dlb_max_cutoff(cr, fr->ic->rlist);
          }
      }
  
@@ -1046,7 +1001,6 @@ void pme_loadbal_do(pme_load_balancing_t *pme_lb,
          fr->ewaldcoeff_q  = fr->ic->ewaldcoeff_q;
          fr->ewaldcoeff_lj = fr->ic->ewaldcoeff_lj;
          fr->rlist         = fr->ic->rlist;
-        fr->rlistlong     = fr->ic->rlistlong;
          fr->rcoulomb      = fr->ic->rcoulomb;
          fr->rvdw          = fr->ic->rvdw;
  
@@ -1081,23 +1035,6 @@ static int pme_grid_points(const pme_setup_t *setup)
      return setup->grid[XX]*setup->grid[YY]*setup->grid[ZZ];
  }
  
-/*! \brief Retuern the largest short-range list cut-off radius */
-static real pme_loadbal_rlist(const pme_setup_t *setup)
-{
-    /* With the group cut-off scheme we can have twin-range either
-     * for Coulomb or for VdW, so we need a check here.
-     * With the Verlet cut-off scheme rlist=rlistlong.
-     */
-    if (setup->rcut_coulomb > setup->rlist)
-    {
-        return setup->rlistlong;
-    }
-    else
-    {
-        return setup->rlist;
-    }
-}
-
  /*! \brief Print one load-balancing setting */
  static void print_pme_loadbal_setting(FILE              *fplog,
                                        const char        *name,
@@ -1106,7 +1043,7 @@ static void print_pme_loadbal_setting(FILE              *fplog,
      fprintf(fplog,
              "   %-7s %6.3f nm %6.3f nm     %3d %3d %3d   %5.3f nm  %5.3f nm\n",
              name,
-            setup->rcut_coulomb, pme_loadbal_rlist(setup),
+            setup->rcut_coulomb, setup->rlist,
              setup->grid[XX], setup->grid[YY], setup->grid[ZZ],
              setup->spacing, 1/setup->ewaldcoeff_q);
  }
@@ -1120,8 +1057,8 @@ static void print_pme_loadbal_settings(pme_load_balancing_t *pme_lb,
      double     pp_ratio, grid_ratio;
      real       pp_ratio_temporary;
  
-    pp_ratio_temporary = pme_loadbal_rlist(&pme_lb->setup[pme_lb->cur])/pme_loadbal_rlist(&pme_lb->setup[0]);
-    pp_ratio           = std::pow(static_cast<double>(pp_ratio_temporary), 3.0);
+    pp_ratio_temporary = pme_lb->setup[pme_lb->cur].rlist / pme_lb->setup[0].rlist;
+    pp_ratio           = gmx::power3(pp_ratio_temporary);
      grid_ratio         = pme_grid_points(&pme_lb->setup[pme_lb->cur])/
          (double)pme_grid_points(&pme_lb->setup[0]);