Move responsibility for bonded threading decomposition
diff --git a/src/gromacs/listed-forces/manage-threading.cpp b/src/gromacs/listed-forces/manage-threading.cpp
index 197490e675cf00484d552408ea3b3e820a42d4c3..c79f2537cd9997c9f7f3e924ccf4972022c55fdc 100644
--- a/src/gromacs/listed-forces/manage-threading.cpp
+++ b/src/gromacs/listed-forces/manage-threading.cpp
@@ -66,9 +66,9 @@
 
 /*! \brief struct for passing all data required for a function type */
 typedef struct {
-    int      ftype; /**< the function type index */
-    t_ilist *il;    /**< pointer to t_ilist entry corresponding to ftype */
-    int      nat;   /**< nr of atoms involved in a single ftype interaction */
+    int            ftype; /**< the function type index */
+    const t_ilist *il;    /**< pointer to t_ilist entry corresponding to ftype */
+    int            nat;   /**< nr of atoms involved in a single ftype interaction */
 } ilist_data_t;
 
 /*! \brief Divides listed interactions over threads
@@ -78,10 +78,9 @@ typedef struct {
  * equal load and different threads avoid touching the same atoms as much
  * as possible.
  */
-static void divide_bondeds_by_locality(int                 ntype,
-                                       const ilist_data_t *ild,
-                                       int                 nthread,
-                                       t_idef             *idef)
+static void divide_bondeds_by_locality(bonded_threading_t *bt,
+                                       int                 ntype,
+                                       const ilist_data_t *ild)
 {
     int nat_tot, nat_sum;
     int ind[F_NRE];    /* index into the ild[].il->iatoms */
@@ -103,8 +102,8 @@ static void divide_bondeds_by_locality(int                 ntype,
     }
 
     nat_sum = 0;
-    /* Loop over the end bounds of the nthread threads to determine
-     * which interactions threads 0 to nthread shall calculate.
+    /* Loop over the end bounds of the nthreads threads to determine
+     * which interactions threads 0 to nthreads shall calculate.
      *
      * NOTE: The cost of these combined loops is #interactions*ntype.
      * This code is running single threaded (difficult to parallelize
@@ -114,7 +113,7 @@ static void divide_bondeds_by_locality(int                 ntype,
      * be negligible. At high thread count many other parts of the code
      * scale the same way, so it's (currently) not worth improving this.
      */
-    for (t = 1; t <= nthread; t++)
+    for (t = 1; t <= bt->nthreads; t++)
     {
         int nat_thread;
 
@@ -125,7 +124,7 @@ static void divide_bondeds_by_locality(int                 ntype,
          * uniformly. Proper and RB dihedrals are often distributed
          * non-uniformly, but their cost is roughly equal.
          */
-        nat_thread = (nat_tot*t)/nthread;
+        nat_thread = (nat_tot*t)/bt->nthreads;
 
         while (nat_sum < nat_thread)
         {
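
The bound is cumulative rather than per-thread, so integer rounding never accumulates: thread nthreads-1 always ends exactly at nat_tot. A minimal standalone sketch of the target computation, with made-up sizes (not the GROMACS code itself):

    // Sketch: the proportional atom bounds computed as
    // nat_thread = (nat_tot*t)/bt->nthreads above. Values are hypothetical.
    #include <cstdio>

    int main()
    {
        const int natTot   = 1003; // total atoms over all bonded types
        const int nthreads = 4;
        for (int t = 1; t <= nthreads; t++)
        {
            // Thread t-1 keeps taking interactions until the running
            // atom sum reaches this cumulative bound.
            printf("thread %d: cumulative atom bound %d\n",
                   t - 1, (natTot*t)/nthreads);
        }
        return 0; // prints 250, 501, 752, 1003
    }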
@@ -177,7 +176,7 @@ static void divide_bondeds_by_locality(int                 ntype,
         /* Store the bonded end boundaries (at index t) for thread t-1 */
         for (f = 0; f < ntype; f++)
         {
-            idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f];
+            bt->il_thread_division[ild[f].ftype*(bt->nthreads + 1) + t] = ind[f];
         }
     }
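
The boundaries stored above live in il_thread_division, a flattened [F_NRE][nthreads + 1] table: entry ftype*(nthreads + 1) + t marks where thread t's slice of il[ftype].iatoms starts, and the next entry marks where it ends. A reader-side sketch of that convention (getThreadRange() is a hypothetical helper, not part of this patch):

    // Sketch: recovering thread t's half-open iatoms range for one
    // function type from the flattened division table.
    static void getThreadRange(const bonded_threading_t *bt, int ftype, int t,
                               int *begin, int *end)
    {
        *begin = bt->il_thread_division[ftype*(bt->nthreads + 1) + t];
        *end   = bt->il_thread_division[ftype*(bt->nthreads + 1) + t + 1];
    }

Column 0 is always 0 and column nthreads ends up at il[ftype].nr, so the nthreads slices tile the whole interaction list.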
 
@@ -188,27 +187,23 @@ static void divide_bondeds_by_locality(int                 ntype,
 }
 
 //! Divides bonded interactions over threads
-static void divide_bondeds_over_threads(t_idef *idef,
-                                        int     nthread,
-                                        int     max_nthread_uniform,
-                                        bool   *haveBondeds)
+static void divide_bondeds_over_threads(bonded_threading_t *bt,
+                                        const t_idef       *idef)
 {
     ilist_data_t ild[F_NRE];
     int          ntype;
     int          f;
 
-    assert(nthread > 0);
+    assert(bt->nthreads > 0);
 
-    idef->nthreads = nthread;
-
-    if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc)
+    if (F_NRE*(bt->nthreads + 1) > bt->il_thread_division_nalloc)
     {
-        idef->il_thread_division_nalloc = F_NRE*(nthread + 1);
-        srenew(idef->il_thread_division, idef->il_thread_division_nalloc);
+        bt->il_thread_division_nalloc = F_NRE*(bt->nthreads + 1);
+        srenew(bt->il_thread_division, bt->il_thread_division_nalloc);
     }
 
-    *haveBondeds = false;
-    ntype        = 0;
+    bt->haveBondeds = false;
+    ntype           = 0;
     for (f = 0; f < F_NRE; f++)
     {
         if (!ftype_is_bonded_potential(f))
@@ -218,19 +213,19 @@ static void divide_bondeds_over_threads(t_idef *idef,
 
         if (idef->il[f].nr > 0)
         {
-            *haveBondeds = true;
+            bt->haveBondeds = true;
         }
 
         if (idef->il[f].nr == 0)
         {
             /* No interactions, avoid all the integer math below */
             int t;
-            for (t = 0; t <= nthread; t++)
+            for (t = 0; t <= bt->nthreads; t++)
             {
-                idef->il_thread_division[f*(nthread + 1) + t] = 0;
+                bt->il_thread_division[f*(bt->nthreads + 1) + t] = 0;
             }
         }
-        else if (nthread <= max_nthread_uniform || f == F_DISRES)
+        else if (bt->nthreads <= bt->max_nthread_uniform || f == F_DISRES)
         {
             /* On up to 4 threads, load balancing the bonded work
              * is more important than minimizing the reduction cost.
@@ -240,12 +235,12 @@ static void divide_bondeds_over_threads(t_idef *idef,
             /* nat1 = 1 + #atoms(ftype) which is the stride used for iatoms */
             nat1 = 1 + NRAL(f);
 
-            for (t = 0; t <= nthread; t++)
+            for (t = 0; t <= bt->nthreads; t++)
             {
                 int nr_t;
 
                 /* Divide equally over the threads */
-                nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1;
+                nr_t = (((idef->il[f].nr/nat1)*t)/bt->nthreads)*nat1;
 
                 if (f == F_DISRES)
                 {
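
Dividing by nat1 before scaling and multiplying back afterwards keeps every boundary aligned to a whole interaction, since iatoms stores one type index plus nat atoms per interaction. A standalone sketch with hypothetical numbers (10 angle interactions, so nat1 = 1 + 3 = 4):

    // Sketch of the stride-aligned uniform split
    // nr_t = (((idef->il[f].nr/nat1)*t)/bt->nthreads)*nat1.
    #include <cstdio>

    int main()
    {
        const int nr = 40, nat1 = 4, nthreads = 3;
        for (int t = 0; t <= nthreads; t++)
        {
            // Divide in units of interactions, then convert back to an
            // iatoms index so no interaction straddles two threads.
            printf("boundary %d: %d\n", t, (((nr/nat1)*t)/nthreads)*nat1);
        }
        return 0; // prints 0, 12, 24, 40: threads get 3, 3 and 4 interactions
    }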
@@ -260,7 +255,7 @@ static void divide_bondeds_over_threads(t_idef *idef,
                     }
                 }
 
-                idef->il_thread_division[f*(nthread + 1) + t] = nr_t;
+                bt->il_thread_division[f*(bt->nthreads + 1) + t] = nr_t;
             }
         }
         else
@@ -274,7 +269,7 @@ static void divide_bondeds_over_threads(t_idef *idef,
             ild[ntype].nat   = nat;
 
             /* The first index for the thread division is always 0 */
-            idef->il_thread_division[f*(nthread + 1)] = 0;
+            bt->il_thread_division[f*(bt->nthreads + 1)] = 0;
 
             ntype++;
         }
@@ -282,7 +277,7 @@ static void divide_bondeds_over_threads(t_idef *idef,
 
     if (ntype > 0)
     {
-        divide_bondeds_by_locality(ntype, ild, nthread, idef);
+        divide_bondeds_by_locality(bt, ntype, ild);
     }
 
     if (debug)
@@ -297,11 +292,11 @@ static void divide_bondeds_over_threads(t_idef *idef,
                 int t;
 
                 fprintf(debug, "%16s", interaction_function[f].name);
-                for (t = 0; t < nthread; t++)
+                for (t = 0; t < bt->nthreads; t++)
                 {
                     fprintf(debug, " %4d",
-                            (idef->il_thread_division[f*(nthread + 1) + t + 1] -
-                             idef->il_thread_division[f*(nthread + 1) + t])/
+                            (bt->il_thread_division[f*(bt->nthreads + 1) + t + 1] -
+                             bt->il_thread_division[f*(bt->nthreads + 1) + t])/
                             (1 + NRAL(f)));
                 }
                 fprintf(debug, "\n");
@@ -312,21 +307,22 @@ static void divide_bondeds_over_threads(t_idef *idef,
 
 //! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
 static void
-calc_bonded_reduction_mask(int natoms,
-                           f_thread_t *f_thread,
-                           const t_idef *idef,
-                           int thread, int nthread)
+calc_bonded_reduction_mask(int                       natoms,
+                           f_thread_t               *f_thread,
+                           const t_idef             *idef,
+                           int                       thread,
+                           const bonded_threading_t &bondedThreading)
 {
     static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS, "For the error message below we assume these two are equal.");
 
-    if (nthread > BITMASK_SIZE)
+    if (bondedThreading.nthreads > BITMASK_SIZE)
     {
 #pragma omp master
         gmx_fatal(FARGS, "You are using %d OpenMP threads, which is larger than GMX_OPENMP_MAX_THREADS (%d). Decrease the number of OpenMP threads or rebuild GROMACS with a larger value for GMX_OPENMP_MAX_THREADS.",
-                  nthread, GMX_OPENMP_MAX_THREADS);
+                  bondedThreading.nthreads, GMX_OPENMP_MAX_THREADS);
 #pragma omp barrier
     }
-    GMX_ASSERT(nthread <= BITMASK_SIZE, "We need at least nthread bits in the mask");
+    GMX_ASSERT(bondedThreading.nthreads <= BITMASK_SIZE, "We need at least nthreads bits in the mask");
 
     int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;
 
@@ -355,8 +351,8 @@ calc_bonded_reduction_mask(int natoms,
             {
                 int nat1 = interaction_function[ftype].nratoms + 1;
 
-                int nb0 = idef->il_thread_division[ftype*(nthread + 1) + thread];
-                int nb1 = idef->il_thread_division[ftype*(nthread + 1) + thread + 1];
+                int nb0 = bondedThreading.il_thread_division[ftype*(bondedThreading.nthreads + 1) + thread];
+                int nb1 = bondedThreading.il_thread_division[ftype*(bondedThreading.nthreads + 1) + thread + 1];
 
                 for (int i = nb0; i < nb1; i += nat1)
                 {
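
The loop body elided from this hunk marks, for every atom of every interaction in the thread's range, the thread's bit in the mask of the force block that atom falls in. A hedged reconstruction (written from the surrounding code, not copied from the patch):

    // Sketch: atoms sit at offsets 1..nat1-1 after the parameter-type
    // entry in iatoms; each one flags this thread in its block's mask.
    for (int a = 1; a < nat1; a++)
    {
        int atom = idef->il[ftype].iatoms[i + a];
        bitmask_set_bit(&f_thread->mask[atom >> reduction_block_bits], thread);
    }

A block's mask then records exactly which threads wrote forces into it, which is what the reduction in setup_bonded_threading() consumes.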
@@ -382,18 +378,16 @@ calc_bonded_reduction_mask(int natoms,
     }
 }
 
-void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
+void setup_bonded_threading(bonded_threading_t *bt,
+                            int                 numAtoms,
+                            const t_idef       *idef)
 {
-    bonded_threading_t *bt   = fr->bonded_threading;
     int                 ctot = 0;
 
     assert(bt->nthreads >= 1);
 
     /* Divide the bonded interactions over the threads */
-    divide_bondeds_over_threads(idef,
-                                bt->nthreads,
-                                bt->bonded_max_nthread_uniform,
-                                &bt->haveBondeds);
+    divide_bondeds_over_threads(bt, idef);
 
     if (!bt->haveBondeds)
     {
@@ -409,8 +403,8 @@ void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
     {
         try
         {
-            calc_bonded_reduction_mask(fr->natoms_force, &bt->f_t[t],
-                                       idef, t, bt->nthreads);
+            calc_bonded_reduction_mask(numAtoms, &bt->f_t[t],
+                                       idef, t, *bt);
         }
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
     }
@@ -418,7 +412,7 @@ void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
     /* Reduce the masks over the threads and determine which blocks
      * we need to reduce over.
      */
-    int nblock_tot = (fr->natoms_force + reduction_block_size - 1) >> reduction_block_bits;
+    int nblock_tot = (numAtoms + reduction_block_size - 1) >> reduction_block_bits;
     if (nblock_tot > bt->block_nalloc)
     {
         bt->block_nalloc = over_alloc_large(nblock_tot);
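
The step that follows (elided between these hunks) is the reduction the comment announces: OR the per-thread masks block by block and index the blocks that any thread touched. A hedged reconstruction using the gmx_bitmask_t helpers; the bonded_threading_t fields mask and block_index are assumptions here:

    // Sketch (reconstructed, not copied from the patch): union the
    // per-thread touch masks and record blocks that need reducing.
    bt->nblock_used = 0;
    for (int b = 0; b < nblock_tot; b++)
    {
        gmx_bitmask_t mask;
        bitmask_clear(&mask);
        for (int t = 0; t < bt->nthreads; t++)
        {
            bitmask_union(&mask, bt->f_t[t].mask[b]);
        }
        if (!bitmask_is_zero(mask))
        {
            bt->block_index[bt->nblock_used++] = b;
        }
    }

bt->nblock_used and the per-thread masks are visible elsewhere in this file; only the exact helper calls are reconstructed.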
@@ -472,13 +466,12 @@ void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
         fprintf(debug, "Number of %d atom blocks to reduce: %d\n",
                 reduction_block_size, bt->nblock_used);
         fprintf(debug, "Reduction density %.2f for touched blocks only %.2f\n",
-                ctot*reduction_block_size/(double)fr->natoms_force,
+                ctot*reduction_block_size/(double)numAtoms,
                 ctot/(double)bt->nblock_used);
     }
 }
 
-void tear_down_bonded_threading(bonded_threading_t *bt,
-                                t_idef             *idef)
+void tear_down_bonded_threading(bonded_threading_t *bt)
 {
     for (int th = 0; th < bt->nthreads; th++)
     {
@@ -489,8 +482,8 @@ void tear_down_bonded_threading(bonded_threading_t *bt,
         }
     }
     sfree(bt->f_t);
+    sfree(bt->il_thread_division);
     sfree(bt);
-    sfree(idef->il_thread_division);
 }
 
 void init_bonded_threading(FILE *fplog, int nenergrp,
@@ -546,16 +539,16 @@ void init_bonded_threading(FILE *fplog, int nenergrp,
 
     if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != nullptr)
     {
-        sscanf(ptr, "%d", &bt->bonded_max_nthread_uniform);
+        sscanf(ptr, "%d", &bt->max_nthread_uniform);
         if (fplog != nullptr)
         {
             fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
-                    bt->bonded_max_nthread_uniform);
+                    bt->max_nthread_uniform);
         }
     }
     else
     {
-        bt->bonded_max_nthread_uniform = max_nthread_uniform;
+        bt->max_nthread_uniform = max_nthread_uniform;
     }
 
     *bt_ptr = bt;
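
For reference, the knob read from GMX_BONDED_NTHREAD_UNIFORM feeds the branch shown earlier in divide_bondeds_over_threads(); condensed:

    // Condensed from the hunk above: uniform split on few threads (and
    // always for distance restraints), locality-aware split otherwise.
    if (bt->nthreads <= bt->max_nthread_uniform || f == F_DISRES)
    {
        // Equal interaction counts per thread: best load balance.
    }
    else
    {
        // Locality-based division: threads share fewer force blocks,
        // so the reduction is cheaper.
    }

Setting GMX_BONDED_NTHREAD_UNIFORM=16, for example, keeps the uniform split in use up to 16 OpenMP threads.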