Add nbnxn tree force reduction

author Roland Schulz <roland@utk.edu>

Mon, 2 Dec 2013 15:07:55 +0000 (10:07 -0500)

committer Gerrit Code Review <gerrit@gerrit.gromacs.org>

Wed, 15 Jan 2014 22:25:46 +0000 (23:25 +0100)
author Roland Schulz <roland@utk.edu>
Mon, 2 Dec 2013 15:07:55 +0000 (10:07 -0500)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
Wed, 15 Jan 2014 22:25:46 +0000 (23:25 +0100)
diff --git a/manual/install.tex b/manual/install.tex

index ff46b68f45615e3255f0d96bbf36e5cf0a593e70..a6c4857ed7964762954b73510d4c4fff4b495471 100644 (file)
--- a/manual/install.tex
+++ b/manual/install.tex
@@ -1,7 +1,7 @@
  %
  % This file is part of the GROMACS molecular simulation package.
  %
-% Copyright (c) 2013, by the GROMACS development team, led by
+% Copyright (c) 2013,2014, by the GROMACS development team, led by
  % Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  % and including many others, as listed in the AUTHORS file in the
  % top-level source directory and at http://www.gromacs.org.
@@ -341,6 +341,8 @@ you should consult your local documentation for details.
  \item   {\tt MDRUN}: the {\tt \normindex{mdrun}} command used by {\tt \normindex{g_tune_pme}}.
  \item   {\tt GMX_NSTLIST}: sets the default value for {\tt nstlist}, preventing it from being tuned during
          {\tt \normindex{mdrun}} startup when using the Verlet cutoff scheme.
+\item   {\tt GMX_USE_TREEREDUCE}: use tree reduction for nbnxn force reduction. Potentially faster for large number of 
+        OpenMP threads (if memory locality is important).
  
  \end{enumerate}
  
diff --git a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h b/src/gromacs/legacyheaders/types/nbnxn_pairlist.h

index c8e7cdfe549cdda784de20fd37929d2c515fd70c..38f7b3d9274b5ec2eef554910daa6b053feea74e 100644 (file)
--- a/src/gromacs/legacyheaders/types/nbnxn_pairlist.h
+++ b/src/gromacs/legacyheaders/types/nbnxn_pairlist.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -40,6 +40,8 @@
  #  include <config.h>
  #endif
  
+#include "thread_mpi/atomic.h"
+
  #ifdef __cplusplus
  extern "C" {
  #endif
@@ -270,6 +272,8 @@ typedef struct {
      int                      nalloc;                 /* Allocation size of all arrays (for x/f *x/fstride) */
      gmx_bool                 bUseBufferFlags;        /* Use the flags or operate on all atoms     */
      nbnxn_buffer_flags_t     buffer_flags;           /* Flags for buffer zeroing+reduc.  */
+    gmx_bool                 bUseTreeReduce;         /* Use tree for force reduction */
+    tMPI_Atomic_t           *syncStep;               /* Synchronization step for tree reduce */
  } nbnxn_atomdata_t;
  
  #ifdef __cplusplus
diff --git a/src/gromacs/math/utilities.c b/src/gromacs/math/utilities.c

index d5459ca6d4bb7bc417fae75c8ca66c90e59ebe56..ec2b9d4e60a09cb1ab0b5056a2dcfcd7681510ad 100644 (file)
--- a/src/gromacs/math/utilities.c
+++ b/src/gromacs/math/utilities.c
@@ -45,6 +45,7 @@
  #ifdef HAVE__FINITE
  #include <float.h>
  #endif
+#include <assert.h>
  
  int gmx_nint(real a)
  {
@@ -764,12 +765,45 @@ gmx_numzero(double a)
      return gmx_within_tol(a, 0.0, GMX_REAL_MIN/GMX_REAL_EPS);
  }
  
-real
-gmx_log2(real x)
+unsigned int
+gmx_log2i(unsigned int n)
  {
-    const real iclog2 = 1.0/log( 2.0 );
+    assert(n != 0); /* behavior differs for 0 */
+#if defined(__INTEL_COMPILER)
+    return _bit_scan_reverse(n);
+#elif defined(__GNUC__) && UINT_MAX == 4294967295U /*also for clang*/
+    return __builtin_clz(n) ^ 31U;                 /* xor gets optimized out */
+#elif defined(_MSC_VER) && _MSC_VER >= 1400
+    {
+        unsigned long i;
+        _BitScanReverse(&i, n);
+        return i;
+    }
+#elif defined(__xlC__)
+    return 31 - __cntlz4(n);
+#else
+    /* http://graphics.stanford.edu/~seander/bithacks.html#IntegerLogLookup */
+#define LT(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
+    static const char     LogTable256[256] = {
+        -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+        LT(4), LT(5), LT(5), LT(6), LT(6), LT(6), LT(6),
+        LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7), LT(7)
+    };
+#undef LT
+
+    unsigned int r;     /* r will be lg(n) */
+    unsigned int t, tt; /* temporaries */
  
-    return log( x ) * iclog2;
+    if ((tt = n >> 16) != 0)
+    {
+        r = ((t = tt >> 8) != 0) ? 24 + LogTable256[t] : 16 + LogTable256[tt];
+    }
+    else
+    {
+        r = ((t = n >> 8) != 0) ? 8 + LogTable256[t] : LogTable256[n];
+    }
+    return r;
+#endif
  }
  
  gmx_bool
diff --git a/src/gromacs/math/utilities.h b/src/gromacs/math/utilities.h

index 4a7187cd0919ab7bcda354c8c52f2eed56238fa3..aac50d10b66c00fc95265ceb30a0f1d749df06c6 100644 (file)
--- a/src/gromacs/math/utilities.h
+++ b/src/gromacs/math/utilities.h
@@ -38,6 +38,7 @@
  #define GMX_MATH_UTILITIES_H
  
  #include "../legacyheaders/types/simple.h"
+#include <limits.h>
  
  #ifdef __cplusplus
  extern "C" {
@@ -134,12 +135,12 @@ gmx_within_tol(double   f1,
  int
  gmx_numzero(double a);
  
-/*! \brief Compute logarithm to base 2
+/*! \brief Compute floor of logarithm to base 2
   *
   * \return log2(x)
   */
-real
-gmx_log2(real x);
+unsigned int
+gmx_log2i(unsigned int x);
  
  /*! /brief Multiply two large ints
   *
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c

index e1729b6c522f91c6da05657fb573757fa80f4377..6ab0da6d50715eb884410d69cb8513532b10c4bc 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_atomdata.c
+++ b/src/gromacs/mdlib/nbnxn_atomdata.c
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -39,12 +39,14 @@
  
  #include <math.h>
  #include <string.h>
+#include <assert.h>
  #include "smalloc.h"
  #include "macros.h"
  #include "vec.h"
  #include "nbnxn_consts.h"
  #include "nbnxn_internal.h"
  #include "nbnxn_search.h"
+#include "gromacs/utility/gmxomp.h"
  #include "gmx_omp_nthreads.h"
  
  /* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
@@ -520,7 +522,7 @@ void nbnxn_atomdata_init(FILE *fp,
                           nbnxn_alloc_t *alloc,
                           nbnxn_free_t  *free)
  {
-    int      i, j;
+    int      i, j, nth;
      real     c6, c12, tol;
      char    *ptr;
      gmx_bool simple, bCombGeom, bCombLB;
@@ -761,6 +763,32 @@ void nbnxn_atomdata_init(FILE *fp,
      }
      nbat->buffer_flags.flag        = NULL;
      nbat->buffer_flags.flag_nalloc = 0;
+
+    nth = gmx_omp_nthreads_get(emntNonbonded);
+
+    ptr = getenv("GMX_USE_TREEREDUCE");
+    if (ptr != NULL)
+    {
+        nbat->bUseTreeReduce = strtol(ptr, 0, 10);
+    }
+#if defined __MIC__
+    else if (nth > 8) /*on the CPU we currently don't benefit even at 32*/
+    {
+        nbat->bUseTreeReduce = 1;
+    }
+#endif
+    else
+    {
+        nbat->bUseTreeReduce = 0;
+    }
+    if (nbat->bUseTreeReduce)
+    {
+        if (fp)
+        {
+            fprintf(fp, "Using tree force reduction\n\n");
+        }
+        snew(nbat->syncStep, nth);
+    }
  }
  
  static void copy_lj_to_nbat_lj_comb_x4(const real *ljparam_type,
@@ -1273,6 +1301,189 @@ nbnxn_atomdata_add_nbat_f_to_f_part(const nbnxn_search_t nbs,
      }
  }
  
+static gmx_inline unsigned char reverse_bits(unsigned char b)
+{
+    /* http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64BitsDiv */
+    return (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
+}
+
+static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(const nbnxn_atomdata_t *nbat,
+                                                      int                     nth)
+{
+    const nbnxn_buffer_flags_t *flags = &nbat->buffer_flags;
+
+    int next_pow2 = 1<<(gmx_log2i(nth-1)+1);
+
+    assert(nbat->nout == nth); /* tree-reduce currently only works for nout==nth */
+
+    memset(nbat->syncStep, 0, sizeof(*(nbat->syncStep))*nth);
+
+#pragma omp parallel num_threads(nth)
+    {
+        int   b0, b1, b;
+        int   i0, i1;
+        int   group_size, th;
+
+        th = gmx_omp_get_thread_num();
+
+        for (group_size = 2; group_size < 2*next_pow2; group_size *= 2)
+        {
+            int index[2], group_pos, partner_pos, wu;
+            int partner_th = th ^ (group_size/2);
+
+            if (group_size > 2)
+            {
+#ifdef TMPI_ATOMICS
+                /* wait on partner thread - replaces full barrier */
+                int sync_th, sync_group_size;
+
+                tMPI_Atomic_memory_barrier();                         /* gurantee data is saved before marking work as done */
+                tMPI_Atomic_set(&(nbat->syncStep[th]), group_size/2); /* mark previous step as completed */
+
+                /* find thread to sync with. Equal to partner_th unless nth is not a power of two. */
+                for (sync_th = partner_th, sync_group_size = group_size; sync_th >= nth && sync_group_size > 2; sync_group_size /= 2)
+                {
+                    sync_th &= ~(sync_group_size/4);
+                }
+                if (sync_th < nth) /* otherwise nothing to sync index[1] will be >=nout */
+                {
+                    /* wait on the thread which computed input data in previous step */
+                    while (tMPI_Atomic_get((volatile tMPI_Atomic_t*)&(nbat->syncStep[sync_th])) < group_size/2)
+                    {
+                        gmx_pause();
+                    }
+                    /* guarantee that no later load happens before wait loop is finisehd */
+                    tMPI_Atomic_memory_barrier();
+                }
+#else           /* TMPI_ATOMICS */
+#pragma omp barrier
+#endif
+            }
+
+            /* Calculate buffers to sum (result goes into first buffer) */
+            group_pos = th % group_size;
+            index[0]  = th - group_pos;
+            index[1]  = index[0] + group_size/2;
+
+            /* If no second buffer, nothing to do */
+            if (index[1] >= nbat->nout && group_size > 2)
+            {
+                continue;
+            }
+
+#if NBNXN_BUFFERFLAG_MAX_THREADS > 256
+#error reverse_bits assumes max 256 threads
+#endif
+            /* Position is permuted so that one of the 2 vectors being added was computed on the same thread in the previous step.
+               This improves locality and enables to sync with just a single thread between steps (=the levels in the btree).
+               The permutation which allows this corresponds to reversing the bits of the group position.
+             */
+            group_pos = reverse_bits(group_pos)/(256/group_size);
+
+            partner_pos = group_pos ^ 1;
+
+            /* loop over two work-units (own and partner) */
+            for (wu = 0; wu < 2; wu++)
+            {
+                if (wu == 1)
+                {
+                    if (partner_th < nth)
+                    {
+                        break; /* partner exists we don't have to do his work */
+                    }
+                    else
+                    {
+                        group_pos = partner_pos;
+                    }
+                }
+
+                /* Calculate the cell-block range for our thread */
+                b0 = (flags->nflag* group_pos   )/group_size;
+                b1 = (flags->nflag*(group_pos+1))/group_size;
+
+                for (b = b0; b < b1; b++)
+                {
+                    i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+                    i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+
+                    if ((flags->flag[b] & (1ULL<<index[1])) || group_size > 2)
+                    {
+#ifdef GMX_NBNXN_SIMD
+                        nbnxn_atomdata_reduce_reals_simd
+#else
+                        nbnxn_atomdata_reduce_reals
+#endif
+                            (nbat->out[index[0]].f,
+                            (flags->flag[b] & (1ULL<<index[0])) || group_size > 2,
+                            &(nbat->out[index[1]].f), 1, i0, i1);
+
+                    }
+                    else if (!(flags->flag[b] & (1ULL<<index[0])))
+                    {
+                        nbnxn_atomdata_clear_reals(nbat->out[index[0]].f,
+                                                   i0, i1);
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(const nbnxn_atomdata_t *nbat,
+                                                     int                     nth)
+{
+    int th;
+#pragma omp parallel for num_threads(nth) schedule(static)
+    for (th = 0; th < nth; th++)
+    {
+        const nbnxn_buffer_flags_t *flags;
+        int   b0, b1, b;
+        int   i0, i1;
+        int   nfptr;
+        real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];
+        int   out;
+
+        flags = &nbat->buffer_flags;
+
+        /* Calculate the cell-block range for our thread */
+        b0 = (flags->nflag* th   )/nth;
+        b1 = (flags->nflag*(th+1))/nth;
+
+        for (b = b0; b < b1; b++)
+        {
+            i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+            i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
+
+            nfptr = 0;
+            for (out = 1; out < nbat->nout; out++)
+            {
+                if (flags->flag[b] & (1U<<out))
+                {
+                    fptr[nfptr++] = nbat->out[out].f;
+                }
+            }
+            if (nfptr > 0)
+            {
+#ifdef GMX_NBNXN_SIMD
+                nbnxn_atomdata_reduce_reals_simd
+#else
+                nbnxn_atomdata_reduce_reals
+#endif
+                    (nbat->out[0].f,
+                    flags->flag[b] & (1U<<0),
+                    fptr, nfptr,
+                    i0, i1);
+            }
+            else if (!(flags->flag[b] & (1U<<0)))
+            {
+                nbnxn_atomdata_clear_reals(nbat->out[0].f,
+                                           i0, i1);
+            }
+        }
+    }
+}
+
  /* Add the force array(s) from nbnxn_atomdata_t to f */
  void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t    nbs,
                                      int                     locality,
@@ -1312,56 +1523,15 @@ void nbnxn_atomdata_add_nbat_f_to_f(const nbnxn_search_t    nbs,
          /* Reduce the force thread output buffers into buffer 0, before adding
           * them to the, differently ordered, "real" force buffer.
           */
-#pragma omp parallel for num_threads(nth) schedule(static)
-        for (th = 0; th < nth; th++)
+        if (nbat->bUseTreeReduce)
          {
-            const nbnxn_buffer_flags_t *flags;
-            int   b0, b1, b;
-            int   i0, i1;
-            int   nfptr;
-            real *fptr[NBNXN_BUFFERFLAG_MAX_THREADS];
-            int   out;
-
-            flags = &nbat->buffer_flags;
-
-            /* Calculate the cell-block range for our thread */
-            b0 = (flags->nflag* th   )/nth;
-            b1 = (flags->nflag*(th+1))/nth;
-
-            for (b = b0; b < b1; b++)
-            {
-                i0 =  b   *NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
-                i1 = (b+1)*NBNXN_BUFFERFLAG_SIZE*nbat->fstride;
-
-                nfptr = 0;
-                for (out = 1; out < nbat->nout; out++)
-                {
-                    if (flags->flag[b] & (1U<<out))
-                    {
-                        fptr[nfptr++] = nbat->out[out].f;
-                    }
-                }
-                if (nfptr > 0)
-                {
-#ifdef GMX_NBNXN_SIMD
-                    nbnxn_atomdata_reduce_reals_simd
-#else
-                    nbnxn_atomdata_reduce_reals
-#endif
-                        (nbat->out[0].f,
-                        flags->flag[b] & (1U<<0),
-                        fptr, nfptr,
-                        i0, i1);
-                }
-                else if (!(flags->flag[b] & (1U<<0)))
-                {
-                    nbnxn_atomdata_clear_reals(nbat->out[0].f,
-                                               i0, i1);
-                }
-            }
+            nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
+        }
+        else
+        {
+            nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
          }
      }
-
  #pragma omp parallel for num_threads(nth) schedule(static)
      for (th = 0; th < nth; th++)
      {
diff --git a/src/gromacs/utility/gmxomp.h b/src/gromacs/utility/gmxomp.h

index 15b4c644cc3c2ae90dad2f3417d48a2dc93b5249..178eaac4bf3ee2a8912373eb9aafb0d6f44d084f 100644 (file)
--- a/src/gromacs/utility/gmxomp.h
+++ b/src/gromacs/utility/gmxomp.h
@@ -49,6 +49,14 @@
  #ifndef GMX_UTILITY_OMP_H
  #define GMX_UTILITY_OMP_H
  
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef GMX_X86_SSE2
+#include <xmmintrin.h>
+#endif
+
  #include "types/commrec.h"
  #include "mdrun.h"
  
@@ -99,6 +107,21 @@ void gmx_omp_set_num_threads(int num_threads);
  void gmx_omp_check_thread_affinity(FILE *fplog, const t_commrec *cr,
                                     gmx_hw_opt_t *hw_opt);
  
+/*! \brief
+ * Pause for use in a spin-wait loop.
+ */
+static gmx_inline void gmx_pause()
+{
+    /* Replace with tbb::internal::atomic_backoff when/if we use TBB */
+#if defined GMX_X86_SSE2
+    _mm_pause();
+#elif defined __MIC__
+    _mm_delay_32(32);
+#else
+    /* No wait for unknown architecture */
+#endif
+}
+
  /*! \} */
  
  #ifdef __cplusplus
author	Roland Schulz <roland@utk.edu>
	Mon, 2 Dec 2013 15:07:55 +0000 (10:07 -0500)
committer	Gerrit Code Review <gerrit@gerrit.gromacs.org>
	Wed, 15 Jan 2014 22:25:46 +0000 (23:25 +0100)
manual/install.tex		patch \| blob \| history
src/gromacs/legacyheaders/types/nbnxn_pairlist.h		patch \| blob \| history
src/gromacs/math/utilities.c		patch \| blob \| history
src/gromacs/math/utilities.h		patch \| blob \| history
src/gromacs/mdlib/nbnxn_atomdata.c		patch \| blob \| history
src/gromacs/utility/gmxomp.h		patch \| blob \| history