Improve pair search thread load balance

author Berk Hess <hess@kth.se>

Wed, 10 Jun 2015 08:07:35 +0000 (10:07 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Sat, 27 Jun 2015 22:40:39 +0000 (00:40 +0200)
author Berk Hess <hess@kth.se>
Wed, 10 Jun 2015 08:07:35 +0000 (10:07 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Sat, 27 Jun 2015 22:40:39 +0000 (00:40 +0200)
diff --git a/src/gromacs/mdlib/nbnxn_search.c b/src/gromacs/mdlib/nbnxn_search.c

index 7e278a94ee814f077610956989ae9eb36e5fc703..6ec71ccd2a2e38e5fa25c82c7dcc08794a94fbb1 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_search.c
+++ b/src/gromacs/mdlib/nbnxn_search.c
@@ -4818,11 +4818,24 @@ static int get_ci_block_size(const nbnxn_grid_t *gridi,
      /* Without domain decomposition
       * or with less than 3 blocks per task, divide in nth blocks.
       */
-    if (!bDomDec || ci_block*3*nth > gridi->nc)
+    if (!bDomDec || nth*3*ci_block > gridi->nc)
      {
          ci_block = (gridi->nc + nth - 1)/nth;
      }
  
+    if (ci_block > 1 && (nth - 1)*ci_block >= gridi->nc)
+    {
+        /* Some threads have no work. Although reducing the block size
+         * does not decrease the block count on the first few threads,
+         * with GPUs better mixing of "upper" cells that have more empty
+         * clusters results in a somewhat lower max load over all threads.
+         * Without GPUs the regime of so few atoms per thread is less
+         * performance relevant, but with 8-wide SIMD the same reasoning
+         * applies, since the pair list uses 4 i-atom "sub-clusters".
+         */
+        ci_block--;
+    }
+
      return ci_block;
  }
author	Berk Hess <hess@kth.se>
	Wed, 10 Jun 2015 08:07:35 +0000 (10:07 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Sat, 27 Jun 2015 22:40:39 +0000 (00:40 +0200)