From: Mark Abraham
Date: Tue, 9 Feb 2021 06:08:05 +0000 (+0100)
Subject: Remove tree-reduce algorithm
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=f6d04f4314a9e65a7b949b9c88693b928ea44d4c;p=alexxy%2Fgromacs.git

Remove tree-reduce algorithm

This was formerly used only on the now-removed MIC platform. It could
well be useful these days with high core counts on conventional CPU
platforms, but it is likely that something else will prove limiting to
MD step thread scaling before that point. If we ever want it back, then
git will remember it, and we'd probably re-implement it on different
infrastructure anyway.

No need for release notes, as it was not something intended for users
to use.

Renamed the "standard" reduction function now that there is no need to
distinguish it from the tree reduction.

Part of #3982
Fixes #3891
---

diff --git a/docs/user-guide/environment-variables.rst b/docs/user-guide/environment-variables.rst
index 8d288eed43..e057307e73 100644
--- a/docs/user-guide/environment-variables.rst
+++ b/docs/user-guide/environment-variables.rst
@@ -396,10 +396,6 @@ Performance and Run Control
        by mdrun. Values should be between the pruning frequency value
        (1 for CPU and 2 for GPU) and :mdp:`nstlist` ``- 1``.
 
-``GMX_USE_TREEREDUCE``
-        use tree reduction for nbnxn force reduction. Potentially faster for large number of
-        OpenMP threads (if memory locality is important).
-
 .. _opencl-management:
 
 OpenCL management
diff --git a/src/gromacs/nbnxm/atomdata.cpp b/src/gromacs/nbnxm/atomdata.cpp
index 315b05f3f2..dd672625c2 100644
--- a/src/gromacs/nbnxm/atomdata.cpp
+++ b/src/gromacs/nbnxm/atomdata.cpp
@@ -45,8 +45,6 @@
 
 #include 
 
-#include "thread_mpi/atomic.h"
-
 #include "gromacs/math/functions.h"
 #include "gromacs/math/utilities.h"
 #include "gromacs/math/vec.h"
@@ -439,8 +437,7 @@ nbnxn_atomdata_t::nbnxn_atomdata_t(gmx::PinningPolicy pinningPolicy) :
     shift_vec({}, { pinningPolicy }),
     x_({}, { pinningPolicy }),
     simdMasks(),
-    bUseBufferFlags(FALSE),
-    bUseTreeReduce(FALSE)
+    bUseBufferFlags(FALSE)
 {
 }
 
@@ -688,24 +685,6 @@ void nbnxn_atomdata_init(const gmx::MDLogger& mdlog,
     }
 
     nbat->buffer_flags.clear();
-
-    const int nth = gmx_omp_nthreads_get(emntNonbonded);
-
-    const char* ptr = getenv("GMX_USE_TREEREDUCE");
-    if (ptr != nullptr)
-    {
-        nbat->bUseTreeReduce = (strtol(ptr, nullptr, 10) != 0);
-    }
-    else
-    {
-        nbat->bUseTreeReduce = false;
-    }
-    if (nbat->bUseTreeReduce)
-    {
-        GMX_LOG(mdlog.info).asParagraph().appendText("Using tree force reduction");
-
-        nbat->syncStep = new tMPI_Atomic[nth];
-    }
 }
 
 template 
@@ -1261,150 +1240,7 @@ static inline unsigned char reverse_bits(unsigned char b)
     return (b * 0x0202020202ULL & 0x010884422010ULL) % 1023;
 }
 
-static void nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbnxn_atomdata_t* nbat, int nth)
-{
-    gmx::ArrayRef<gmx_bitmask_t> flags = nbat->buffer_flags;
-
-    int next_pow2 = 1 << (gmx::log2I(nth - 1) + 1);
-
-    const int numOutputBuffers = nbat->out.size();
-    GMX_ASSERT(numOutputBuffers == nth,
-               "tree-reduce currently only works for numOutputBuffers==nth");
-
-    memset(nbat->syncStep, 0, sizeof(*(nbat->syncStep)) * nth);
-
-#pragma omp parallel num_threads(nth)
-    {
-        try
-        {
-            int b0, b1, b;
-            int i0, i1;
-            int group_size, th;
-
-            th = gmx_omp_get_thread_num();
-
-            for (group_size = 2; group_size < 2 * next_pow2; group_size *= 2)
-            {
-                int index[2], group_pos, partner_pos, wu;
-                int partner_th = th ^ (group_size / 2);
-
-                if (group_size > 2)
-                {
-#ifdef TMPI_ATOMICS
-                    /* wait on partner thread - replaces full barrier */
-                    int sync_th, sync_group_size;
-
-#    if defined(__clang__) && __clang_major__ >= 8
-                    // Suppress warnings that the use of memory_barrier may be excessive
-                    // Only exists beginning with clang-8
-#        pragma clang diagnostic push
-#        pragma clang diagnostic ignored "-Watomic-implicit-seq-cst"
-#    endif
-
-                    tMPI_Atomic_memory_barrier(); /* guarantee data is saved before marking work as done */
-                    tMPI_Atomic_set(&(nbat->syncStep[th]), group_size / 2); /* mark previous step as completed */
-
-                    /* find thread to sync with. Equal to partner_th unless nth is not a power of two. */
-                    for (sync_th = partner_th, sync_group_size = group_size;
-                         sync_th >= nth && sync_group_size > 2;
-                         sync_group_size /= 2)
-                    {
-                        sync_th &= ~(sync_group_size / 4);
-                    }
-                    if (sync_th < nth) /* otherwise nothing to sync; index[1] will be >=nout */
-                    {
-                        /* wait on the thread which computed input data in previous step */
-                        while (tMPI_Atomic_get(static_cast<volatile tMPI_Atomic*>(&(nbat->syncStep[sync_th])))
-                               < group_size / 2)
-                        {
-                            gmx_pause();
-                        }
-                        /* guarantee that no later load happens before wait loop is finished */
-                        tMPI_Atomic_memory_barrier();
-                    }
-#    if defined(__clang__) && __clang_major__ >= 8
-#        pragma clang diagnostic pop
-#    endif
-#else /* TMPI_ATOMICS */
-#    pragma omp barrier
-#endif
-                }
-
-                /* Calculate buffers to sum (result goes into first buffer) */
-                group_pos = th % group_size;
-                index[0]  = th - group_pos;
-                index[1]  = index[0] + group_size / 2;
-
-                /* If no second buffer, nothing to do */
-                if (index[1] >= numOutputBuffers && group_size > 2)
-                {
-                    continue;
-                }
-
-#if NBNXN_BUFFERFLAG_MAX_THREADS > 256
-#    error reverse_bits assumes max 256 threads
-#endif
-                /* Position is permuted so that one of the 2 vectors being added was
-                   computed on the same thread in the previous step. This improves
-                   locality and enables syncing with just a single thread between
-                   steps (= the levels in the btree). The permutation which allows
-                   this corresponds to reversing the bits of the group position.
-                 */
-                group_pos = reverse_bits(group_pos) / (256 / group_size);
-
-                partner_pos = group_pos ^ 1;
-
-                /* loop over two work-units (own and partner) */
-                for (wu = 0; wu < 2; wu++)
-                {
-                    if (wu == 1)
-                    {
-                        if (partner_th < nth)
-                        {
-                            break; /* partner exists, we don't have to do its work */
-                        }
-                        else
-                        {
-                            group_pos = partner_pos;
-                        }
-                    }
-
-                    /* Calculate the cell-block range for our thread */
-                    b0 = (flags.size() * group_pos) / group_size;
-                    b1 = (flags.size() * (group_pos + 1)) / group_size;
-
-                    for (b = b0; b < b1; b++)
-                    {
-                        i0 = b * NBNXN_BUFFERFLAG_SIZE * nbat->fstride;
-                        i1 = (b + 1) * NBNXN_BUFFERFLAG_SIZE * nbat->fstride;
-
-                        if (bitmask_is_set(flags[b], index[1]) || group_size > 2)
-                        {
-                            const real* fIndex1 = nbat->out[index[1]].f.data();
-#if GMX_SIMD
-                            nbnxn_atomdata_reduce_reals_simd
-#else
-                            nbnxn_atomdata_reduce_reals
-#endif
-                                    (nbat->out[index[0]].f.data(),
-                                     bitmask_is_set(flags[b], index[0]) || group_size > 2,
-                                     &fIndex1,
-                                     1,
-                                     i0,
-                                     i1);
-                        }
-                        else if (!bitmask_is_set(flags[b], index[0]))
-                        {
-                            nbnxn_atomdata_clear_reals(nbat->out[index[0]].f, i0, i1);
-                        }
-                    }
-                }
-            }
-        }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-    }
-}
-
-
-static void nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbnxn_atomdata_t* nbat, int nth)
+static void nbnxn_atomdata_add_nbat_f_to_f_reduce(nbnxn_atomdata_t* nbat, int nth)
 {
 #pragma omp parallel for num_threads(nth) schedule(static)
     for (int th = 0; th < nth; th++)
@@ -1479,14 +1315,7 @@ void reduceForces(nbnxn_atomdata_t* nbat, const gmx::AtomLocality locality, cons
         /* Reduce the force thread output buffers into buffer 0, before adding
          * them to the, differently ordered, "real" force buffer. */
-        if (nbat->bUseTreeReduce)
-        {
-            nbnxn_atomdata_add_nbat_f_to_f_treereduce(nbat, nth);
-        }
-        else
-        {
-            nbnxn_atomdata_add_nbat_f_to_f_stdreduce(nbat, nth);
-        }
+        nbnxn_atomdata_add_nbat_f_to_f_reduce(nbat, nth);
     }
 
 #pragma omp parallel for num_threads(nth) schedule(static)
     for (int th = 0; th < nth; th++)
diff --git a/src/gromacs/nbnxm/atomdata.h b/src/gromacs/nbnxm/atomdata.h
index 0a269c280b..9839381ac7 100644
--- a/src/gromacs/nbnxm/atomdata.h
+++ b/src/gromacs/nbnxm/atomdata.h
@@ -64,7 +64,6 @@ class MDLogger;
 struct NbnxmGpu;
 struct nbnxn_atomdata_t;
 struct nonbonded_verlet_t;
-struct tMPI_Atomic;
 
 class GpuEventSynchronizer;
 
@@ -297,10 +296,6 @@ public:
     gmx_bool bUseBufferFlags;
     //! Flags for buffer zeroing+reduc.
     std::vector<gmx_bitmask_t> buffer_flags;
-    //! Use tree for force reduction
-    gmx_bool bUseTreeReduce;
-    //! Synchronization step for tree reduce
-    tMPI_Atomic* syncStep;
     //! \}
 };
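
For anyone who later wants the removed idea without digging through git
history: the tree reduction combined the nth per-thread force buffers
pairwise over log2(nth) levels, so each thread worked on only two
buffers per level instead of streaming through all nth of them, and the
bit-reversed group position kept one operand of every addition on the
thread that produced it in the previous level, which is where the memory
locality benefit came from. The following minimal standalone sketch
reproduces only the pairing schedule, serially; the buffer layout, the
names, and the one-reducer-per-group simplification are illustrative
assumptions, not GROMACS code.

// Standalone sketch of the pairwise (tree) reduction schedule used by the
// removed nbnxn_atomdata_add_nbat_f_to_f_treereduce(). Assumes a
// power-of-two thread count; all names here are hypothetical.
#include <cassert>
#include <cstdio>
#include <vector>

int main()
{
    const int nth    = 8; // number of per-thread force output buffers
    const int stride = 4; // elements per buffer, standing in for 3*natoms reals

    // Fill buffer t with the value t+1 so the expected total is easy to check.
    std::vector<std::vector<double>> f(nth, std::vector<double>(stride));
    for (int t = 0; t < nth; t++)
    {
        for (int i = 0; i < stride; i++)
        {
            f[t][i] = t + 1.0;
        }
    }

    // One pass per tree level. In the removed code each level ran
    // concurrently on all threads, with a pairwise atomic sync (not a
    // full barrier) between levels.
    for (int group_size = 2; group_size <= nth; group_size *= 2)
    {
        for (int th = 0; th < nth; th++)
        {
            const int group_pos = th % group_size;
            const int dst       = th - group_pos;                  // index[0]
            const int src       = th - group_pos + group_size / 2; // index[1]
            if (group_pos == 0) // serial simplification: one reducer per group
            {
                for (int i = 0; i < stride; i++)
                {
                    f[dst][i] += f[src][i];
                }
            }
        }
    }

    // Buffer 0 now holds the full sum 1 + 2 + ... + nth in every element.
    const double expected = nth * (nth + 1) / 2.0;
    for (int i = 0; i < stride; i++)
    {
        assert(f[0][i] == expected);
    }
    std::printf("tree reduction ok: every element of buffer 0 is %g\n", expected);
    return 0;
}

In the real implementation every thread of a group participated, each
taking a bit-reversed slice of the flagged cell blocks, which is what
made a single pairwise sync per level sufficient.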