From fdf3b7d136c5638244bd6eadc175a5731682e180 Mon Sep 17 00:00:00 2001
From: Berk Hess <hess@kth.se>
Date: Fri, 14 Nov 2014 15:22:05 +0100
Subject: [PATCH] Improve threading of bondeds

To reduce the cost of the force reduction, with more than 4 OpenMP
threads the bondeds are now distributed over the threads using
atom indices instead of uniformly.

Moved the bonded thread data initialization to manage-threading.cpp.

Change-Id: Ica5d6fcfb6db1d85058ef9825d0d515a205ee257
---
 src/gromacs/legacyheaders/types/forcerec.h    |   3 +
 .../listed-forces/manage-threading.cpp        | 318 +++++++++++++++---
 src/gromacs/listed-forces/manage-threading.h  |   5 +-
 src/gromacs/mdlib/forcerec.cpp                |  29 +-
 4 files changed, 288 insertions(+), 67 deletions(-)
diff --git a/src/gromacs/legacyheaders/types/forcerec.h b/src/gromacs/legacyheaders/types/forcerec.h
index f2bd6f0180..33c44a6442 100644
--- a/src/gromacs/legacyheaders/types/forcerec.h
+++ b/src/gromacs/legacyheaders/types/forcerec.h
@@ -475,6 +475,9 @@ typedef struct {
     int         red_nblock;
     f_thread_t *f_t;
 
+    /* Maximum thread count for uniform distribution of bondeds over threads */
+    int   bonded_max_nthread_uniform;
+
     /* Exclusion load distribution over the threads */
     int  *excl_load;
 } t_forcerec;
diff --git a/src/gromacs/listed-forces/manage-threading.cpp b/src/gromacs/listed-forces/manage-threading.cpp
index 01bed0bffa..411be9a682 100644
--- a/src/gromacs/listed-forces/manage-threading.cpp
+++ b/src/gromacs/listed-forces/manage-threading.cpp
@@ -46,66 +46,252 @@
 #include "manage-threading.h"
 
 #include <assert.h>
+#include <limits.h>
 
 #include <algorithm>
 
+#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
 #include "gromacs/listed-forces/listed-forces.h"
 #include "gromacs/mdlib/forcerec-threading.h"
+#include "gromacs/pbcutil/ishift.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/stringutil.h"
 
-//! Divides listed interactions over threads
-static void divide_bondeds_over_threads(t_idef *idef, int nthreads)
+/*! \brief struct for passing all data required for a function type */
+typedef struct {
+    int      ftype; /**< the function type index */
+    t_ilist *il;    /**< pointer to t_ilist entry corresponding to ftype */
+    int      nat;   /**< nr of atoms involved in a single ftype interaction */
+} ilist_data_t;
+
+/*! \brief Divides listed interactions over threads
+ *
+ * This routine attempts to divide all interactions of the ntype bondeds
+ * types stored in ild over the threads such that each thread has roughly
+ * equal load and different threads avoid touching the same atoms as much
+ * as possible.
+ */
+static void divide_bondeds_by_locality(int                 ntype,
+                                       const ilist_data_t *ild,
+                                       int                 nthread,
+                                       t_idef             *idef)
 {
-    int ftype;
-    int nat1;
-    int t;
-    int il_nr_thread;
+    int nat_tot, nat_sum;
+    int ind[F_NRE];    /* index into the ild[].il->iatoms */
+    int at_ind[F_NRE]; /* index of the first atom of the interaction at ind */
+    int f, t;
 
-    idef->nthreads = nthreads;
+    assert(ntype <= F_NRE);
 
-    if (F_NRE*(nthreads+1) > idef->il_thread_division_nalloc)
+    nat_tot = 0;
+    for (f = 0; f < ntype; f++)
     {
-        idef->il_thread_division_nalloc = F_NRE*(nthreads+1);
-        snew(idef->il_thread_division, idef->il_thread_division_nalloc);
+        /* Sum #bondeds*#atoms_per_bond over all bonded types */
+        nat_tot  += ild[f].il->nr/(ild[f].nat + 1)*ild[f].nat;
+        /* The start bound for thread 0 is 0 for all interactions */
+        ind[f]    = 0;
+        /* Initialize the next atom index array */
+        assert(ild[f].il->nr > 0);
+        at_ind[f] = ild[f].il->iatoms[1];
     }
 
-    for (ftype = 0; ftype < F_NRE; ftype++)
+    nat_sum = 0;
+    /* Loop over the end bounds of the nthread threads to determine
+     * which interactions threads 0 to nthread shall calculate.
+     *
+     * NOTE: The cost of these combined loops is #interactions*ntype.
+     * This code is running single threaded (difficult to parallelize
+     * over threads). So the relative cost of this function increases
+     * linearly with the number of threads. Since the inner-most loop
+     * is cheap and this is done only at DD repartitioning, the cost should
+     * be negligble. At high thread count many other parts of the code
+     * scale the same way, so it's (currently) not worth improving this.
+     */
+    for (t = 1; t <= nthread; t++)
     {
-        if (ftype_is_bonded_potential(ftype))
+        int nat_thread;
+
+        /* Here we assume that the computational cost is proportional
+         * to the number of atoms in the interaction. This is a rough
+         * measure, but roughly correct. Usually there are very few
+         * interactions anyhow and there are distributed relatively
+         * uniformly. Proper and RB dihedrals are often distributed
+         * non-uniformly, but their cost is roughly equal.
+         */
+        nat_thread = (nat_tot*t)/nthread;
+
+        while (nat_sum < nat_thread)
         {
-            nat1 = interaction_function[ftype].nratoms + 1;
+            /* To divide bonds based on atom order, we compare
+             * the index of the first atom in the bonded interaction.
+             * This works well, since the domain decomposition generates
+             * bondeds in order of the atoms by looking up interactions
+             * which are linked to the first atom in each interaction.
+             * It usually also works well without DD, since than the atoms
+             * in bonded interactions are usually in increasing order.
+             * If they are not assigned in increasing order, the balancing
+             * is still good, but the memory access and reduction cost will
+             * be higher.
+             */
+            int f_min;
 
-            for (t = 0; t <= nthreads; t++)
+            /* Find out which of the types has the lowest atom index */
+            f_min = 0;
+            for (f = 1; f < ntype; f++)
             {
-                /* Divide the interactions equally over the threads.
-                 * When the different types of bonded interactions
-                 * are distributed roughly equally over the threads,
-                 * this should lead to well localized output into
-                 * the force buffer on each thread.
-                 * If this is not the case, a more advanced scheme
-                 * (not implemented yet) will do better.
-                 */
-                il_nr_thread = (((idef->il[ftype].nr/nat1)*t)/nthreads)*nat1;
-
-                /* Ensure that distance restraint pairs with the same label
-                 * end up on the same thread.
-                 * This is slighlty tricky code, since the next for iteration
-                 * may have an initial il_nr_thread lower than the final value
-                 * in the previous iteration, but this will anyhow be increased
-                 * to the approriate value again by this while loop.
+                if (at_ind[f] < at_ind[f_min])
+                {
+                    f_min = f;
+                }
+            }
+            assert(f_min >= 0 && f_min < ntype);
+
+            /* Assign the interaction with the lowest atom index (of type
+             * index f_min) to thread t-1 by increasing ind.
+             */
+            ind[f_min] += ild[f_min].nat + 1;
+            nat_sum    += ild[f_min].nat;
+
+            /* Update the first unassigned atom index for this type */
+            if (ind[f_min] < ild[f_min].il->nr)
+            {
+                at_ind[f_min] = ild[f_min].il->iatoms[ind[f_min] + 1];
+            }
+            else
+            {
+                /* We have assigned all interactions of this type.
+                 * Setting at_ind to INT_MAX ensures this type will not be
+                 * chosen in the for loop above during next iterations.
                  */
-                while (ftype == F_DISRES &&
-                       il_nr_thread > 0 &&
-                       il_nr_thread < idef->il[ftype].nr &&
-                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread]].disres.label ==
-                       idef->iparams[idef->il[ftype].iatoms[il_nr_thread-nat1]].disres.label)
+                at_ind[f_min] = INT_MAX;
+            }
+        }
+
+        /* Store the bonded end boundaries (at index t) for thread t-1 */
+        for (f = 0; f < ntype; f++)
+        {
+            idef->il_thread_division[ild[f].ftype*(nthread + 1) + t] = ind[f];
+        }
+    }
+
+    for (f = 0; f < ntype; f++)
+    {
+        assert(ind[f] == ild[f].il->nr);
+    }
+}
+
+//! Divides bonded interactions over threads
+static void divide_bondeds_over_threads(t_idef *idef,
+                                        int     nthread,
+                                        int     max_nthread_uniform)
+{
+    ilist_data_t ild[F_NRE];
+    int          ntype;
+    int          f;
+
+    assert(nthread > 0);
+
+    idef->nthreads = nthread;
+
+    if (F_NRE*(nthread + 1) > idef->il_thread_division_nalloc)
+    {
+        idef->il_thread_division_nalloc = F_NRE*(nthread + 1);
+        snew(idef->il_thread_division, idef->il_thread_division_nalloc);
+    }
+
+    ntype = 0;
+    for (f = 0; f < F_NRE; f++)
+    {
+        if (!ftype_is_bonded_potential(f))
+        {
+            continue;
+        }
+
+        if (idef->il[f].nr == 0)
+        {
+            /* No interactions, avoid all the integer math below */
+            int t;
+            for (t = 0; t <= nthread; t++)
+            {
+                idef->il_thread_division[f*(nthread + 1) + t] = 0;
+            }
+        }
+        else if (nthread <= max_nthread_uniform || f == F_DISRES)
+        {
+            /* On up to 4 threads, load balancing the bonded work
+             * is more important than minimizing the reduction cost.
+             */
+            int nat1, t;
+
+            /* nat1 = 1 + #atoms(ftype) which is the stride use for iatoms */
+            nat1 = 1 + NRAL(f);
+
+            for (t = 0; t <= nthread; t++)
+            {
+                int nr_t;
+
+                /* Divide equally over the threads */
+                nr_t = (((idef->il[f].nr/nat1)*t)/nthread)*nat1;
+
+                if (f == F_DISRES)
                 {
-                    il_nr_thread += nat1;
+                    /* Ensure that distance restraint pairs with the same label
+                     * end up on the same thread.
+                     */
+                    while (nr_t > 0 && nr_t < idef->il[f].nr &&
+                           idef->iparams[idef->il[f].iatoms[nr_t]].disres.label ==
+                           idef->iparams[idef->il[f].iatoms[nr_t-nat1]].disres.label)
+                    {
+                        nr_t += nat1;
+                    }
                 }
 
-                idef->il_thread_division[ftype*(nthreads+1)+t] = il_nr_thread;
+                idef->il_thread_division[f*(nthread + 1) + t] = nr_t;
+            }
+        }
+        else
+        {
+            /* Add this ftype to the list to be distributed */
+            int nat;
+
+            nat              = NRAL(f);
+            ild[ntype].ftype = f;
+            ild[ntype].il    = &idef->il[f];
+            ild[ntype].nat   = nat;
+
+            /* The first index for the thread division is always 0 */
+            idef->il_thread_division[f*(nthread + 1)] = 0;
+
+            ntype++;
+        }
+    }
+
+    if (ntype > 0)
+    {
+        divide_bondeds_by_locality(ntype, ild, nthread, idef);
+    }
+
+    if (debug)
+    {
+        int f;
+
+        fprintf(debug, "Division of bondeds over threads:\n");
+        for (f = 0; f < F_NRE; f++)
+        {
+            if (ftype_is_bonded_potential(f) && idef->il[f].nr > 0)
+            {
+                int t;
+
+                fprintf(debug, "%16s", interaction_function[f].name);
+                for (t = 0; t < nthread; t++)
+                {
+                    fprintf(debug, " %4d",
+                            (idef->il_thread_division[f*(nthread + 1) + t + 1] -
+                             idef->il_thread_division[f*(nthread + 1) + t])/
+                            (1 + NRAL(f)));
+                }
+                fprintf(debug, "\n");
             }
         }
     }
@@ -154,7 +340,7 @@ calc_bonded_reduction_mask(gmx_bitmask_t *mask,
  */
 const int maxBlockBits = BITMASK_SIZE;
 
-void setup_bonded_threading(t_forcerec   *fr, t_idef *idef)
+void setup_bonded_threading(t_forcerec *fr, t_idef *idef)
 {
     int t;
     int ctot, c, b;
@@ -162,7 +348,9 @@ void setup_bonded_threading(t_forcerec   *fr, t_idef *idef)
     assert(fr->nthreads >= 1);
 
     /* Divide the bonded interaction over the threads */
-    divide_bondeds_over_threads(idef, fr->nthreads);
+    divide_bondeds_over_threads(idef,
+                                fr->nthreads,
+                                fr->bonded_max_nthread_uniform);
 
     if (fr->nthreads == 1)
     {
@@ -229,3 +417,55 @@ void setup_bonded_threading(t_forcerec   *fr, t_idef *idef)
                 ctot*(1<<fr->red_ashift)/(double)(fr->natoms_force*fr->nthreads));
     }
 }
+
+void init_bonded_threading(FILE *fplog, t_forcerec *fr, int nenergrp)
+{
+    /* These thread local data structures are used for bondeds only */
+    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
+
+    if (fr->nthreads > 1)
+    {
+        int t;
+
+        snew(fr->f_t, fr->nthreads);
+#pragma omp parallel for num_threads(fr->nthreads) schedule(static)
+        for (t = 0; t < fr->nthreads; t++)
+        {
+            /* Thread 0 uses the global force and energy arrays */
+            if (t > 0)
+            {
+                int i;
+
+                fr->f_t[t].f        = NULL;
+                fr->f_t[t].f_nalloc = 0;
+                snew(fr->f_t[t].fshift, SHIFTS);
+                fr->f_t[t].grpp.nener = nenergrp*nenergrp;
+                for (i = 0; i < egNR; i++)
+                {
+                    snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener);
+                }
+            }
+        }
+
+        /* The optimal value after which to switch from uniform to localized
+         * bonded interaction distribution is 3, 4 or 5 depending on the system
+         * and hardware.
+         */
+        const int max_nthread_uniform = 4;
+        char *    ptr;
+
+        if ((ptr = getenv("GMX_BONDED_NTHREAD_UNIFORM")) != NULL)
+        {
+            sscanf(ptr, "%d", &fr->bonded_max_nthread_uniform);
+            if (fplog != NULL)
+            {
+                fprintf(fplog, "\nMax threads for uniform bonded distribution set to %d by env.var.\n",
+                        fr->bonded_max_nthread_uniform);
+            }
+        }
+        else
+        {
+            fr->bonded_max_nthread_uniform = max_nthread_uniform;
+        }
+    }
+}
diff --git a/src/gromacs/listed-forces/manage-threading.h b/src/gromacs/listed-forces/manage-threading.h
index 1cf0844ce2..2096ec63ae 100644
--- a/src/gromacs/listed-forces/manage-threading.h
+++ b/src/gromacs/listed-forces/manage-threading.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -60,6 +60,9 @@ extern "C" {
  */
 void setup_bonded_threading(t_forcerec *fr, t_idef *idef);
 
+/*! \brief Initialize the bonded threading data structures */
+void init_bonded_threading(FILE *fplog, t_forcerec *fr, int nenergrp);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gromacs/mdlib/forcerec.cpp b/src/gromacs/mdlib/forcerec.cpp
index a34f2dd83c..193894b7cf 100644
--- a/src/gromacs/mdlib/forcerec.cpp
+++ b/src/gromacs/mdlib/forcerec.cpp
@@ -64,6 +64,7 @@
 #include "gromacs/legacyheaders/typedefs.h"
 #include "gromacs/legacyheaders/types/commrec.h"
 #include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
+#include "gromacs/listed-forces/manage-threading.h"
 #include "gromacs/math/calculate-ewald-splitting-coefficient.h"
 #include "gromacs/math/units.h"
 #include "gromacs/math/utilities.h"
@@ -1537,32 +1538,6 @@ gmx_bool can_use_allvsall(const t_inputrec *ir, gmx_bool bPrintNote, t_commrec *
 }
 
 
-static void init_forcerec_f_threads(t_forcerec *fr, int nenergrp)
-{
-    int t, i;
-
-    /* These thread local data structures are used for bondeds only */
-    fr->nthreads = gmx_omp_nthreads_get(emntBonded);
-
-    if (fr->nthreads > 1)
-    {
-        snew(fr->f_t, fr->nthreads);
-        /* Thread 0 uses the global force and energy arrays */
-        for (t = 1; t < fr->nthreads; t++)
-        {
-            fr->f_t[t].f        = NULL;
-            fr->f_t[t].f_nalloc = 0;
-            snew(fr->f_t[t].fshift, SHIFTS);
-            fr->f_t[t].grpp.nener = nenergrp*nenergrp;
-            for (i = 0; i < egNR; i++)
-            {
-                snew(fr->f_t[t].grpp.ener[i], fr->f_t[t].grpp.nener);
-            }
-        }
-    }
-}
-
-
 gmx_bool nbnxn_acceleration_supported(FILE             *fplog,
                                       const t_commrec  *cr,
                                       const t_inputrec *ir,
@@ -3228,7 +3203,7 @@ void init_forcerec(FILE              *fp,
     }
 
     /* Initialize the thread working data for bonded interactions */
-    init_forcerec_f_threads(fr, mtop->groups.grps[egcENER].nr);
+    init_bonded_threading(fp, fr, mtop->groups.grps[egcENER].nr);
 
     snew(fr->excl_load, fr->nthreads+1);
 
-- 
2.22.0