From: Mark Abraham
Date: Tue, 9 Feb 2021 06:08:05 +0000 (+0100)
Subject: Remove tree-reduce algorithm
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=f6d04f4314a9e65a7b949b9c88693b928ea44d4c;p=alexxy%2Fgromacs.git

Remove tree-reduce algorithm

This was formerly used only on the now-removed MIC platform. It could
well be useful these days with high core counts on conventional CPU
platforms, but it is likely that something else will prove limiting to
MD step thread scaling before that point. If we ever want it back, then
git will remember it, and we'd probably re-implement it on different
infrastructure anyway.

No need for release notes, as it was not something intended for users
to use.

Renamed the "standard" reduction function now that there is no need to
distinguish it from the tree reduction.

Part of #3982
Fixes #3891
---

diff --git a/docs/user-guide/environment-variables.rst b/docs/user-guide/environment-variables.rst
index 8d288eed43..e057307e73 100644
--- a/docs/user-guide/environment-variables.rst
+++ b/docs/user-guide/environment-variables.rst
@@ -396,10 +396,6 @@ Performance and Run Control
        by mdrun. Values should be between the pruning frequency value
        (1 for CPU and 2 for GPU) and :mdp:`nstlist` ``- 1``.
 
-``GMX_USE_TREEREDUCE``
-        use tree reduction for nbnxn force reduction. Potentially faster for large number of
-        OpenMP threads (if memory locality is important).
-
 .. _opencl-management:
 
 OpenCL management
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index 315b05f3f2..dd672625c2 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -45,8 +45,6 @@
 
 #include 
 
-#include "thread_mpi/atomic.h"
-
 #include "gromacs/math/functions.h"
 #include "gromacs/math/utilities.h"
 #include "gromacs/math/vec.h"
@@ -439,8 +437,7 @@ nbnxn_atomdata_t::nbnxn_atomdata_t(gmx::PinningPolicy pinningPolicy) :
     shift_vec({}, { pinningPolicy }),
     x_({}, { pinningPolicy }),
     simdMasks(),
-    bUseBufferFlags(FALSE),
-    bUseTreeReduce(FALSE)
+    bUseBufferFlags(FALSE)
 {
 }
 
@@ -688,24 +685,6 @@ void nbnxn_atomdata_init(const gmx::MDLogger& mdlog,
     }
 
     nbat->buffer_flags.clear();
-
-    const int nth = gmx_omp_nthreads_get(emntNonbonded);
-
-    const char* ptr = getenv("GMX_USE_TREEREDUCE");
-    if (ptr != nullptr)
-    {
-        nbat->bUseTreeReduce = (strtol(ptr, nullptr, 10) != 0);
-    }
-    else
-    {
-        nbat->bUseTreeReduce = false;
-    }
-    if (nbat->bUseTreeReduce)
-    {
-        GMX_LOG(mdlog.info).asParagraph().appendText("Using tree force reduction");
-
-        nbat->syncStep = new tMPI_Atomic[nth];
-    }
 }
 
 template 
@@ -1261,150 +1240,7 @@ static inline unsigned char reverse_bits(unsigned char b)
     return (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
 }
 
-static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbnxn_atomdata_t* nbat, int nth)
-{
-    gmx::ArrayRef<gmx_bitmask_t> flags = nbat->buffer_flags;
-
-    int next_pow2 = 1 << (gmx::log2I(nth - 1) + 1);
-
-    const int numOutputBuffers = nbat->out.size();
-    GMX_ASSERT(numOutputBuffers == nth,
-               "tree-reduce currently only works for numOutputBuffers==nth");
-
-    memset(nbat->syncStep, 0, sizeof(*(nbat->syncStep)) * nth);
-
-#pragma omp parallel num_threads(nth)
-    {
-        try
-        {
-            int b0, b1, b;
-            int i0, i1;
-            int group_size, th;
-
-            th = gmx_omp_get_thread_num();
-
-            for (group_size = 2; group_size < 2 * next_pow2; group_size *= 2)
-            {
-                int index[2], group_pos, partner_pos, wu;
-                int partner_th = th ^ (group_size / 2);
-
-                if (group_size > 2)
-                {
-#ifdef TMPI_ATOMICS
-                    /* wait on partner thread - replaces full barrier */
-                    int sync_th, sync_group_size;
-
-#    if defined(__clang__) && __clang_major__ >= 8
-                    // Suppress warnings that the use of memory_barrier may be excessive
-                    // Only exists beginning with clang-8
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Watomic-implicit-seq-cst"
-#    endif
-
-                    tMPI_Atomic_memory_barrier(); /* guarantee data is saved before marking work as done */
-                    tMPI_Atomic_set(&(nbat->syncStep[th]), group_size / 2); /* mark previous step as completed */
-
-                    /* find thread to sync with. Equal to partner_th unless nth is not a power of two. */
-                    for (sync_th = partner_th, sync_group_size = group_size;
-                         sync_th >= nth && sync_group_size > 2;
-                         sync_group_size /= 2)
-                    {
-                        sync_th &= ~(sync_group_size / 4);
-                    }
-                    if (sync_th < nth) /* otherwise nothing to sync; index[1] will be >=nout */
-                    {
-                        /* wait on the thread which computed input data in previous step */
-                        while (tMPI_Atomic_get(static_cast<volatile tMPI_Atomic*>(&(nbat->syncStep[sync_th])))
-                               < group_size / 2)
-                        {
-                            gmx_pause();
-                        }
-                        /* guarantee that no later load happens before wait loop is finished */
-                        tMPI_Atomic_memory_barrier();
-                    }
-#    if defined(__clang__) && __clang_major__ >= 8
-#        pragma clang diagnostic pop
-#    endif
-#else /* TMPI_ATOMICS */
-#    pragma omp barrier
-#endif
-                }
-
-                /* Calculate buffers to sum (result goes into first buffer) */
-                group_pos = th % group_size;
-                index[0]  = th - group_pos;
-                index[1]  = index[0] + group_size / 2;
-
-                /* If no second buffer, nothing to do */
-                if (index[1] >= numOutputBuffers && group_size > 2)
-                {
-                    continue;
-                }
-
-#if NBNXN_BUFFERFLAG_MAX_THREADS > 256
-#    error reverse_bits assumes max 256 threads
-#endif
-                /* Position is permuted so that one of the 2 vectors being added was
-                   computed on the same thread in the previous step. This improves
-                   locality and enables syncing with just a single thread between
-                   steps (= the levels in the btree). The permutation which allows
-                   this corresponds to reversing the bits of the group position.
-                 */
-                group_pos = reverse_bits(group_pos) / (256 / group_size);
-
-                partner_pos = group_pos ^ 1;
-
-                /* loop over two work-units (own and partner) */
-                for (wu = 0; wu < 2; wu++)
-                {
-                    if (wu == 1)
-                    {
-                        if (partner_th < nth)
-                        {
-                            break; /* partner exists, we don't have to do its work */
-                        }
-                        else
-                        {
-                            group_pos = partner_pos;
-                        }
-                    }
-
-                    /* Calculate the cell-block range for our thread */
-                    b0 = (flags.size() * group_pos) / group_size;
-                    b1 = (flags.size() * (group_pos + 1)) / group_size;
-
-                    for (b = b0; b < b1; b++)
-                    {
-                        i0 = b * NBNXN_BUFFERFLAG_SIZE * nbat->fstride;
-                        i1 = (b + 1) * NBNXN_BUFFERFLAG_SIZE * nbat->fstride;
-
-                        if (bitmask_is_set(flags[b], index[1]) || group_size > 2)
-                        {
-                            const real* fIndex1 = nbat->out[index[1]].f.data();
-#if GMX_SIMD
-                            nbnxn_atomdata_reduce_reals_simd
-#else
-                            nbnxn_atomdata_reduce_reals
-#endif
-                                    (nbat->out[index[0]].f.data(),
-                                     bitmask_is_set(flags[b], index[0]) || group_size > 2,
-                                     &fIndex1,
-                                     1,
-                                     i0,
-                                     i1);
-                        }
-                        else if (!bitmask_is_set(flags[b], index[0]))
-                        {
-                            nbnxn_atomdata_clear_reals(nbat->out[index[0]].f, i0, i1);
-                        }
-                    }
-                }
-            }
-        }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-    }
-}
-
-
-static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t* nbat, int nth)
+static void nbnxn_atomdata_add_nbat_f_to_f_reduce(nbnxn_atomdata_t* nbat, int nth)
 {
 #pragma omp parallel for num_threads(nth) schedule(static)
     for (int th = 0; th < nth; th++)
@@ -1479,14 +1315,7 @@ void reduceForces(nbnxn_atomdata_t* nbat, const gmx::AtomLocality locality, cons
         /* Reduce the force thread output buffers into buffer 0, before adding
          * them to the, differently ordered, "real" force buffer. */
-        if (nbat->bUseTreeReduce)
-        {
-            nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
-        }
-        else
-        {
-            nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
-        }
+        nbnxn_atomdata_add_nbat_f_to_f_reduce(nbat, nth);
     }
 
 #pragma omp parallel for num_threads(nth) schedule(static)
     for (int th = 0; th < nth; th++)
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index 0a269c280b..9839381ac7 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -64,7 +64,6 @@ class MDLogger;
 struct NbnxmGpu;
 struct nbnxn_atomdata_t;
 struct nonbonded_verlet_t;
-struct tMPI_Atomic;
 
 class GpuEventSynchronizer;
 
@@ -297,10 +296,6 @@ public:
     gmx_bool bUseBufferFlags;
     //! Flags for buffer zeroing+reduc.
     std::vector<gmx_bitmask_t> buffer_flags;
-    //! Use tree for force reduction
-    gmx_bool bUseTreeReduce;
-    //! Synchronization step for tree reduce
-    tMPI_Atomic* syncStep;
     //! \}
 };
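
For anyone who later wants the removed idea without digging through git
history: the tree reduction combined the nth per-thread force buffers
pairwise over log2(nth) levels, so each thread worked on only two
buffers per level instead of streaming through all nth of them, and the
bit-reversed group position kept one operand of every addition on the
thread that produced it in the previous level, which is where the memory
locality benefit came from. The following minimal standalone sketch
reproduces only the pairing schedule, serially; the buffer layout, the
names, and the one-reducer-per-group simplification are illustrative
assumptions, not GROMACS code.

// Standalone sketch of the pairwise (tree) reduction schedule used by the
// removed nbnxn_atomdata_add_nbat_f_to_f_treereduce(). Assumes a
// power-of-two thread count; all names here are hypothetical.
#include <cassert>
#include <cstdio>
#include <vector>

int main()
{
    const int nth    = 8; // number of per-thread force output buffers
    const int stride = 4; // elements per buffer, standing in for 3*natoms reals

    // Fill buffer t with the value t+1 so the expected total is easy to check.
    std::vector<std::vector<double>> f(nth, std::vector<double>(stride));
    for (int t = 0; t < nth; t++)
    {
        for (int i = 0; i < stride; i++)
        {
            f[t][i] = t + 1.0;
        }
    }

    // One pass per tree level. In the removed code each level ran
    // concurrently on all threads, with a pairwise atomic sync (not a
    // full barrier) between levels.
    for (int group_size = 2; group_size <= nth; group_size *= 2)
    {
        for (int th = 0; th < nth; th++)
        {
            const int group_pos = th % group_size;
            const int dst       = th - group_pos;                  // index[0]
            const int src       = th - group_pos + group_size / 2; // index[1]
            if (group_pos == 0) // serial simplification: one reducer per group
            {
                for (int i = 0; i < stride; i++)
                {
                    f[dst][i] += f[src][i];
                }
            }
        }
    }

    // Buffer 0 now holds the full sum 1 + 2 + ... + nth in every element.
    const double expected = nth * (nth + 1) / 2.0;
    for (int i = 0; i < stride; i++)
    {
        assert(f[0][i] == expected);
    }
    std::printf("tree reduction ok: every element of buffer 0 is %g\n", expected);
    return 0;
}

In the real implementation every thread of a group participated, each
taking a bit-reversed slice of the flagged cell blocks, which is what
made a single pairwise sync per level sufficient.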