Extract ThreadForceBuffer from listed forces
authorBerk Hess <hess@kth.se>
Thu, 9 Sep 2021 15:08:44 +0000 (15:08 +0000)
committerJoe Jordan <ejjordan12@gmail.com>
Thu, 9 Sep 2021 15:08:44 +0000 (15:08 +0000)
src/gromacs/listed_forces/listed_forces.cpp
src/gromacs/listed_forces/listed_internal.h
src/gromacs/listed_forces/manage_threading.cpp
src/gromacs/mdtypes/CMakeLists.txt
src/gromacs/mdtypes/threaded_force_buffer.cpp [new file with mode: 0644]
src/gromacs/mdtypes/threaded_force_buffer.h [new file with mode: 0644]

index ad01d3fba15b7333f1c8eaee54180a4041b0762e..25900e838d0f102ebb28769000b28915d9d9ecef 100644 (file)
@@ -197,174 +197,6 @@ bool isPairInteraction(int ftype)
     return ((ftype) >= F_LJ14 && (ftype) <= F_LJC_PAIRS_NB);
 }
 
-/*! \brief Zero thread-local output buffers */
-void zero_thread_output(f_thread_t* f_t)
-{
-    constexpr int nelem_fa = sizeof(f_t->f[0]) / sizeof(real);
-
-    for (int i = 0; i < f_t->nblock_used; i++)
-    {
-        int a0 = f_t->block_index[i] * reduction_block_size;
-        int a1 = a0 + reduction_block_size;
-        for (int a = a0; a < a1; a++)
-        {
-            for (int d = 0; d < nelem_fa; d++)
-            {
-                f_t->f[a][d] = 0;
-            }
-        }
-    }
-
-    for (int i = 0; i < gmx::c_numShiftVectors; i++)
-    {
-        clear_rvec(f_t->fshift[i]);
-    }
-    for (int i = 0; i < F_NRE; i++)
-    {
-        f_t->ener[i] = 0;
-    }
-    for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
-    {
-        for (int j = 0; j < f_t->grpp.nener; j++)
-        {
-            f_t->grpp.energyGroupPairTerms[i][j] = 0;
-        }
-    }
-    for (auto i : keysOf(f_t->dvdl))
-    {
-        f_t->dvdl[i] = 0;
-    }
-}
-
-/*! \brief The max thread number is arbitrary, we used a fixed number
- * to avoid memory management.  Using more than 16 threads is probably
- * never useful performance wise. */
-#define MAX_BONDED_THREADS 256
-
-/*! \brief Reduce thread-local force buffers */
-void reduce_thread_forces(gmx::ArrayRef<gmx::RVec> force, const bonded_threading_t* bt, int nthreads)
-{
-    if (nthreads > MAX_BONDED_THREADS)
-    {
-        gmx_fatal(FARGS, "Can not reduce bonded forces on more than %d threads", MAX_BONDED_THREADS);
-    }
-
-    rvec* gmx_restrict f = as_rvec_array(force.data());
-
-    const int numAtomsForce = bt->numAtomsForce;
-
-    /* This reduction can run on any number of threads,
-     * independently of bt->nthreads.
-     * But if nthreads matches bt->nthreads (which it currently does)
-     * the uniform distribution of the touched blocks over nthreads will
-     * match the distribution of bonded over threads well in most cases,
-     * which means that threads mostly reduce their own data which increases
-     * the number of cache hits.
-     */
-#pragma omp parallel for num_threads(nthreads) schedule(static)
-    for (int b = 0; b < bt->nblock_used; b++)
-    {
-        try
-        {
-            int    ind = bt->block_index[b];
-            rvec4* fp[MAX_BONDED_THREADS];
-
-            /* Determine which threads contribute to this block */
-            int nfb = 0;
-            for (int ft = 0; ft < bt->nthreads; ft++)
-            {
-                if (bitmask_is_set(bt->mask[ind], ft))
-                {
-                    fp[nfb++] = bt->f_t[ft]->f;
-                }
-            }
-            if (nfb > 0)
-            {
-                /* Reduce force buffers for threads that contribute */
-                int a0 = ind * reduction_block_size;
-                int a1 = (ind + 1) * reduction_block_size;
-                /* It would be nice if we could pad f to avoid this min */
-                a1 = std::min(a1, numAtomsForce);
-                for (int a = a0; a < a1; a++)
-                {
-                    for (int fb = 0; fb < nfb; fb++)
-                    {
-                        rvec_inc(f[a], fp[fb][a]);
-                    }
-                }
-            }
-        }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-    }
-}
-
-/*! \brief Reduce thread-local forces, shift forces and energies */
-void reduce_thread_output(gmx::ForceWithShiftForces* forceWithShiftForces,
-                          real*                      ener,
-                          gmx_grppairener_t*         grpp,
-                          gmx::ArrayRef<real>        dvdl,
-                          const bonded_threading_t*  bt,
-                          const gmx::StepWorkload&   stepWork)
-{
-    assert(bt->haveBondeds);
-
-    if (bt->nblock_used > 0)
-    {
-        /* Reduce the bonded force buffer */
-        reduce_thread_forces(forceWithShiftForces->force(), bt, bt->nthreads);
-    }
-
-    rvec* gmx_restrict fshift = as_rvec_array(forceWithShiftForces->shiftForces().data());
-
-    /* When necessary, reduce energy and virial using one thread only */
-    if ((stepWork.computeEnergy || stepWork.computeVirial || stepWork.computeDhdl) && bt->nthreads > 1)
-    {
-        gmx::ArrayRef<const std::unique_ptr<f_thread_t>> f_t = bt->f_t;
-
-        if (stepWork.computeVirial)
-        {
-            for (int i = 0; i < gmx::c_numShiftVectors; i++)
-            {
-                for (int t = 1; t < bt->nthreads; t++)
-                {
-                    rvec_inc(fshift[i], f_t[t]->fshift[i]);
-                }
-            }
-        }
-        if (stepWork.computeEnergy)
-        {
-            for (int i = 0; i < F_NRE; i++)
-            {
-                for (int t = 1; t < bt->nthreads; t++)
-                {
-                    ener[i] += f_t[t]->ener[i];
-                }
-            }
-            for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
-            {
-                for (int j = 0; j < f_t[1]->grpp.nener; j++)
-                {
-                    for (int t = 1; t < bt->nthreads; t++)
-                    {
-                        grpp->energyGroupPairTerms[i][j] += f_t[t]->grpp.energyGroupPairTerms[i][j];
-                    }
-                }
-            }
-        }
-        if (stepWork.computeDhdl)
-        {
-            for (auto i : keysOf(f_t[1]->dvdl))
-            {
-
-                for (int t = 1; t < bt->nthreads; t++)
-                {
-                    dvdl[static_cast<int>(i)] += f_t[t]->dvdl[i];
-                }
-            }
-        }
-    }
-}
-
 /*! \brief Returns the bonded kernel flavor
  *
  * Note that energies are always requested when the virial
@@ -557,18 +389,16 @@ static void calcBondedForces(const InteractionDefinitions& idef,
     {
         try
         {
-            f_thread_t& threadBuffers = *bt->f_t[thread];
-            int         ftype;
-            real        v;
+            auto& threadBuffer = bt->threadedForceBuffer.threadForceBuffer(thread);
             /* thread stuff */
             rvec*               fshift;
             gmx::ArrayRef<real> dvdlt;
             gmx::ArrayRef<real> epot;
             gmx_grppairener_t*  grpp;
 
-            zero_thread_output(&threadBuffers);
+            threadBuffer.clearForcesAndEnergies();
 
-            rvec4* ft = threadBuffers.f;
+            rvec4* ft = threadBuffer.forceBuffer();
 
             /* Thread 0 writes directly to the main output buffers.
              * We might want to reconsider this.
@@ -582,37 +412,38 @@ static void calcBondedForces(const InteractionDefinitions& idef,
             }
             else
             {
-                fshift = as_rvec_array(threadBuffers.fshift.data());
-                epot   = threadBuffers.ener;
-                grpp   = &threadBuffers.grpp;
-                dvdlt  = threadBuffers.dvdl;
+                fshift = as_rvec_array(threadBuffer.shiftForces().data());
+                epot   = threadBuffer.energyTerms();
+                grpp   = &threadBuffer.groupPairEnergies();
+                dvdlt  = threadBuffer.dvdl();
             }
             /* Loop over all bonded force types to calculate the bonded forces */
-            for (ftype = 0; (ftype < F_NRE); ftype++)
+            for (int ftype = 0; (ftype < F_NRE); ftype++)
             {
                 const InteractionList& ilist = idef.il[ftype];
                 if (!ilist.empty() && ftype_is_bonded_potential(ftype))
                 {
                     ArrayRef<const int> iatoms = gmx::makeConstArrayRef(ilist.iatoms);
-                    v                          = calc_one_bond(thread,
-                                      ftype,
-                                      idef,
-                                      iatoms,
-                                      idef.numNonperturbedInteractions[ftype],
-                                      bt->workDivision,
-                                      x,
-                                      ft,
-                                      fshift,
-                                      fr,
-                                      pbc_null,
-                                      grpp,
-                                      nrnb,
-                                      lambda,
-                                      dvdlt,
-                                      md,
-                                      fcd,
-                                      stepWork,
-                                      global_atom_index);
+
+                    real v = calc_one_bond(thread,
+                                           ftype,
+                                           idef,
+                                           iatoms,
+                                           idef.numNonperturbedInteractions[ftype],
+                                           bt->workDivision,
+                                           x,
+                                           ft,
+                                           fshift,
+                                           fr,
+                                           pbc_null,
+                                           grpp,
+                                           nrnb,
+                                           lambda,
+                                           dvdlt,
+                                           md,
+                                           fcd,
+                                           stepWork,
+                                           global_atom_index);
                     epot[ftype] += v;
                 }
             }
@@ -683,7 +514,8 @@ void calc_listed(struct gmx_wallcycle*         wcycle,
         wallcycle_sub_stop(wcycle, WallCycleSubCounter::Listed);
 
         wallcycle_sub_start(wcycle, WallCycleSubCounter::ListedBufOps);
-        reduce_thread_output(&forceWithShiftForces, enerd->term.data(), &enerd->grpp, dvdl, bt, stepWork);
+        bt->threadedForceBuffer.reduce(
+                &forceWithShiftForces, enerd->term.data(), &enerd->grpp, dvdl, stepWork, 1);
 
         if (stepWork.computeDhdl)
         {
index 0689ba12634fd96b2299339b48170a5119eb5fcc..cc3e8c30f27360c9f5015e1db6c27ca8f9062f72 100644 (file)
@@ -49,6 +49,7 @@
 
 #include "gromacs/math/vectypes.h"
 #include "gromacs/mdtypes/enerdata.h"
+#include "gromacs/mdtypes/threaded_force_buffer.h"
 #include "gromacs/topology/idef.h"
 #include "gromacs/topology/ifunc.h"
 #include "gromacs/utility/alignedallocator.h"
 #include "gromacs/utility/classhelpers.h"
 #include "gromacs/utility/enumerationhelpers.h"
 
-/* We reduce the force array in blocks of 32 atoms. This is large enough
- * to not cause overhead and 32*sizeof(rvec) is a multiple of the cache-line
- * size on all systems.
- */
-static const int reduction_block_size = 32; /**< Force buffer block size in atoms*/
-static const int reduction_block_bits = 5;  /**< log2(reduction_block_size) */
-
 /*! \internal \brief The division of bonded interactions of the threads */
 class WorkDivision
 {
@@ -92,37 +86,6 @@ private:
     std::vector<int> packedBounds_;
 };
 
-/*! \internal \brief struct with output for bonded forces, used per thread */
-struct f_thread_t
-{
-    //! Constructor
-    f_thread_t(int numEnergyGroups);
-
-    ~f_thread_t() = default;
-
-    //! Force array pointer, equals fBuffer.data(), needed because rvec4 is not a C++ type
-    rvec4* f = nullptr;
-    //! Force array buffer
-    std::vector<real, gmx::AlignedAllocator<real>> fBuffer;
-    //! Mask for marking which parts of f are filled, working array for constructing mask in bonded_threading_t
-    std::vector<gmx_bitmask_t> mask;
-    //! Number of blocks touched by our thread
-    int nblock_used = 0;
-    //! Index to touched blocks
-    std::vector<int> block_index;
-
-    //! Shift force array, size c_numShiftVectors
-    std::vector<gmx::RVec> fshift;
-    //! Energy array
-    real ener[F_NRE];
-    //! Group pair energy data for pairs
-    gmx_grppairener_t grpp;
-    //! Free-energy dV/dl output
-    gmx::EnumerationArray<FreeEnergyPerturbationCouplingType, real> dvdl;
-
-    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(f_thread_t);
-};
-
 /*! \internal \brief struct contain all data for bonded force threading */
 struct bonded_threading_t
 {
@@ -131,18 +94,10 @@ struct bonded_threading_t
 
     //! Number of threads to be used for bondeds
     int nthreads = 0;
-    //! Force/energy data per thread, size nthreads, stored in unique_ptr to allow thread local allocation
-    std::vector<std::unique_ptr<f_thread_t>> f_t;
-    //! The number of force blocks to reduce
-    int nblock_used = 0;
-    //! Index of size nblock_used into mask
-    std::vector<int> block_index;
-    //! Mask array, one element corresponds to a block of reduction_block_size atoms of the force array, bit corresponding to thread indices set if a thread writes to that block
-    std::vector<gmx_bitmask_t> mask;
+    //! The thread parallel force and energy buffers
+    gmx::ThreadedForceBuffer<rvec4> threadedForceBuffer;
     //! true if we have and thus need to reduce bonded forces
     bool haveBondeds = false;
-    //! The number of atoms forces are computed for
-    int numAtomsForce = 0;
 
     /* There are two different ways to distribute the bonded force calculation
      * over the threads. We decide which to use based on the number of threads.
index 26f9ce89eec7a96e552f0bba75e8731129462830..8b1b68eb2b62d4eb6f27add05d40032524bd4278 100644 (file)
@@ -328,11 +328,11 @@ static void divide_bondeds_over_threads(bonded_threading_t*           bt,
 }
 
 //! Construct a reduction mask for which parts (blocks) of the force array are touched on which thread task
-static void calc_bonded_reduction_mask(int                           natoms,
-                                       f_thread_t*                   f_thread,
-                                       const InteractionDefinitions& idef,
-                                       int                           thread,
-                                       const bonded_threading_t&     bondedThreading)
+static void calc_bonded_reduction_mask(int                            natoms,
+                                       gmx::ThreadForceBuffer<rvec4>* f_thread,
+                                       const InteractionDefinitions&  idef,
+                                       int                            thread,
+                                       const bonded_threading_t&      bondedThreading)
 {
     static_assert(BITMASK_SIZE == GMX_OPENMP_MAX_THREADS,
                   "For the error message below we assume these two are equal.");
@@ -349,20 +349,8 @@ static void calc_bonded_reduction_mask(int                           natoms,
     GMX_ASSERT(bondedThreading.nthreads <= BITMASK_SIZE,
                "We need at least nthreads bits in the mask");
 
-    const int nblock = (natoms + reduction_block_size - 1) >> reduction_block_bits;
 
-    f_thread->mask.resize(nblock);
-    f_thread->block_index.resize(nblock);
-    // NOTE: It seems f_thread->f does not need to be aligned
-    f_thread->fBuffer.resize(nblock * reduction_block_size * sizeof(rvec4) / sizeof(real));
-    f_thread->f = reinterpret_cast<rvec4*>(f_thread->fBuffer.data());
-
-    for (gmx_bitmask_t& mask : f_thread->mask)
-    {
-        bitmask_clear(&mask);
-    }
-
-    gmx::ArrayRef<gmx_bitmask_t> mask = f_thread->mask;
+    f_thread->resizeBufferAndClearMask(natoms);
 
     for (int ftype = 0; ftype < F_NRE; ftype++)
     {
@@ -380,24 +368,14 @@ static void calc_bonded_reduction_mask(int                           natoms,
                 {
                     for (int a = 1; a < nat1; a++)
                     {
-                        bitmask_set_bit(&mask[idef.il[ftype].iatoms[i + a] >> reduction_block_bits], thread);
+                        f_thread->addAtomToMask(idef.il[ftype].iatoms[i + a]);
                     }
                 }
             }
         }
     }
 
-    /* Make an index of the blocks our thread touches, so we can do fast
-     * force buffer clearing.
-     */
-    f_thread->nblock_used = 0;
-    for (int b = 0; b < nblock; b++)
-    {
-        if (bitmask_is_set(mask[b], thread))
-        {
-            f_thread->block_index[f_thread->nblock_used++] = b;
-        }
-    }
+    f_thread->processMask();
 }
 
 void setup_bonded_threading(bonded_threading_t*           bt,
@@ -405,12 +383,8 @@ void setup_bonded_threading(bonded_threading_t*           bt,
                             bool                          useGpuForBondeds,
                             const InteractionDefinitions& idef)
 {
-    int ctot = 0;
-
     assert(bt->nthreads >= 1);
 
-    bt->numAtomsForce = numAtomsForce;
-
     /* Divide the bonded interaction over the threads */
     divide_bondeds_over_threads(bt, useGpuForBondeds, idef);
 
@@ -428,101 +402,29 @@ void setup_bonded_threading(bonded_threading_t*           bt,
     {
         try
         {
-            calc_bonded_reduction_mask(numAtomsForce, bt->f_t[t].get(), idef, t, *bt);
+            calc_bonded_reduction_mask(
+                    numAtomsForce, &bt->threadedForceBuffer.threadForceBuffer(t), idef, t, *bt);
         }
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
     }
 
-    /* Reduce the masks over the threads and determine which blocks
-     * we need to reduce over.
-     */
-    int nblock_tot = (numAtomsForce + reduction_block_size - 1) >> reduction_block_bits;
-    /* Ensure we have sufficient space for all blocks */
-    if (static_cast<size_t>(nblock_tot) > bt->block_index.size())
-    {
-        bt->block_index.resize(nblock_tot);
-    }
-    if (static_cast<size_t>(nblock_tot) > bt->mask.size())
-    {
-        bt->mask.resize(nblock_tot);
-    }
-    bt->nblock_used = 0;
-    for (int b = 0; b < nblock_tot; b++)
-    {
-        gmx_bitmask_t* mask = &bt->mask[b];
-
-        /* Generate the union over the threads of the bitmask */
-        bitmask_clear(mask);
-        for (int t = 0; t < bt->nthreads; t++)
-        {
-            bitmask_union(mask, bt->f_t[t]->mask[b]);
-        }
-        if (!bitmask_is_zero(*mask))
-        {
-            bt->block_index[bt->nblock_used++] = b;
-        }
-
-        if (debug)
-        {
-            int c = 0;
-            for (int t = 0; t < bt->nthreads; t++)
-            {
-                if (bitmask_is_set(*mask, t))
-                {
-                    c++;
-                }
-            }
-            ctot += c;
-
-            if (gmx_debug_at)
-            {
-                fprintf(debug, "block %d flags %s count %d\n", b, to_hex_string(*mask).c_str(), c);
-            }
-        }
-    }
-    if (debug)
-    {
-        fprintf(debug, "Number of %d atom blocks to reduce: %d\n", reduction_block_size, bt->nblock_used);
-        fprintf(debug,
-                "Reduction density %.2f for touched blocks only %.2f\n",
-                ctot * reduction_block_size / static_cast<double>(numAtomsForce),
-                ctot / static_cast<double>(bt->nblock_used));
-    }
-}
-
-f_thread_t::f_thread_t(int numEnergyGroups) : fshift(gmx::c_numShiftVectors), grpp(numEnergyGroups)
-{
+    bt->threadedForceBuffer.setupReduction();
 }
 
 bonded_threading_t::bonded_threading_t(const int numThreads, const int numEnergyGroups, FILE* fplog) :
     nthreads(numThreads),
-    nblock_used(0),
+    threadedForceBuffer(numThreads, numEnergyGroups),
     haveBondeds(false),
     workDivision(nthreads),
     foreignLambdaWorkDivision(1)
 {
-    /* These thread local data structures are used for bondeds only.
-     *
-     * Note that we also use there structures when running single-threaded.
+    /* Note that we also use threadedForceBuffer when running single-threaded.
      * This is because the bonded force buffer uses type rvec4, whereas
      * the normal force buffer is uses type rvec. This leads to a little
      * reduction overhead, but the speed gain in the bonded calculations
      * of doing transposeScatterIncr/DecrU with aligment 4 instead of 3
      * is much larger than the reduction overhead.
      */
-    f_t.resize(numThreads);
-#pragma omp parallel for num_threads(nthreads) schedule(static)
-    for (int t = 0; t < nthreads; t++)
-    {
-        try
-        {
-            /* Note that thread 0 uses the global fshift and energy arrays,
-             * but to keep the code simple, we initialize all data here.
-             */
-            f_t[t] = std::make_unique<f_thread_t>(numEnergyGroups);
-        }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-    }
 
     /* The optimal value after which to switch from uniform to localized
      * bonded interaction distribution is 3, 4 or 5 depending on the system
index bdc90b01615f757d5a410159e24128b891580d66..fde5cfcd5ac56f14b1fcf574b8592f9d3d04d023 100644 (file)
@@ -49,7 +49,8 @@ file(GLOB MDTYPES_SOURCES
     multipletimestepping.cpp
     observableshistory.cpp
     observablesreducer.cpp
-    state.cpp)
+    state.cpp
+    threaded_force_buffer.cpp)
 
 if(GMX_GPU)
     gmx_add_libgromacs_sources(
diff --git a/src/gromacs/mdtypes/threaded_force_buffer.cpp b/src/gromacs/mdtypes/threaded_force_buffer.cpp
new file mode 100644 (file)
index 0000000..c3dc2d9
--- /dev/null
@@ -0,0 +1,376 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief This file defines the implementation of ThreadForceBuffer and ThreadedForceBuffer.
+ *
+ * \author Berk Hess <hess@kth.se>
+ *
+ * \ingroup module_mdtypes
+ */
+#include "gmxpre.h"
+
+#include "threaded_force_buffer.h"
+
+#include "gromacs/math/vec.h"
+#include "gromacs/mdtypes/forceoutput.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/alignedallocator.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
+
+namespace gmx
+{
+
+/*! \brief The max thread number is arbitrary, we used a fixed number
+ * to avoid memory management.  Using more than 16 threads is probably
+ * never useful performance wise. */
+static constexpr int s_maxNumThreadsForReduction = 256;
+
+template<typename ForceBufferElementType>
+ThreadForceBuffer<ForceBufferElementType>::ThreadForceBuffer(const int threadIndex,
+                                                             const int numEnergyGroups) :
+    threadIndex_(threadIndex), shiftForces_(c_numShiftVectors), groupPairEnergies_(numEnergyGroups)
+{
+}
+
+template<typename ForceBufferElementType>
+void ThreadForceBuffer<ForceBufferElementType>::clearForcesAndEnergies()
+{
+    constexpr int c_numComponents = sizeof(ForceBufferElementType) / sizeof(real);
+
+    for (int atomIndex : usedBlockIndices_)
+    {
+        const int bufferIndexBegin = atomIndex * s_reductionBlockSize * c_numComponents;
+        const int bufferIndexEnd   = bufferIndexBegin + s_reductionBlockSize * c_numComponents;
+        std::fill(forceBuffer_.begin() + bufferIndexBegin, forceBuffer_.begin() + bufferIndexEnd, 0.0_real);
+    }
+
+    const RVec zeroVec = { 0.0_real, 0.0_real, 0.0_real };
+    std::fill(shiftForces_.begin(), shiftForces_.end(), zeroVec);
+
+    std::fill(energyTerms_.begin(), energyTerms_.end(), 0.0_real);
+
+    for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
+    {
+        for (int j = 0; j < groupPairEnergies_.nener; j++)
+        {
+            groupPairEnergies_.energyGroupPairTerms[i][j] = 0.0_real;
+        }
+    }
+    for (auto i : keysOf(dvdl_))
+    {
+        dvdl_[i] = 0;
+    }
+}
+
+template<typename ForceBufferElementType>
+void ThreadForceBuffer<ForceBufferElementType>::resizeBufferAndClearMask(const int numAtoms)
+{
+    numAtoms_ = numAtoms;
+
+    const int numBlocks = (numAtoms + s_reductionBlockSize - 1) >> s_numReductionBlockBits;
+
+    reductionMask_.resize(numBlocks);
+    forceBuffer_.resize(numBlocks * s_reductionBlockSize * sizeof(ForceBufferElementType) / sizeof(real));
+
+    for (gmx_bitmask_t& mask : reductionMask_)
+    {
+        bitmask_clear(&mask);
+    }
+}
+
+template<typename ForceBufferElementType>
+void ThreadForceBuffer<ForceBufferElementType>::processMask()
+{
+    // Now we are done setting the masks, generate the new list of used blocks
+    usedBlockIndices_.clear();
+    for (int b = 0; b < ssize(reductionMask_); b++)
+    {
+        if (bitmask_is_set(reductionMask_[b], threadIndex_))
+        {
+            usedBlockIndices_.push_back(b);
+        }
+    }
+}
+
+namespace
+{
+
+//! \brief Reduce thread-local force buffers into \p force (does not reduce shift forces)
+template<typename ForceBufferElementType>
+void reduceThreadForceBuffers(ArrayRef<gmx::RVec> force,
+                              ArrayRef<std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> threadForceBuffers,
+                              ArrayRef<const gmx_bitmask_t> masks,
+                              ArrayRef<const int>           usedBlockIndices)
+{
+    const int numBuffers = threadForceBuffers.size();
+    GMX_ASSERT(numBuffers <= s_maxNumThreadsForReduction,
+               "There is a limit on the number of buffers we can use for reduction");
+
+    const int numAtoms = threadForceBuffers[0]->size();
+
+    rvec* gmx_restrict f = as_rvec_array(force.data());
+
+    /* This reduction can run on any number of threads, independently of the number of buffers.
+     * But if the number of threads matches the number of buffers, which it currently does,
+     * the uniform distribution of the touched blocks over nthreads will
+     * match the distribution of bonded over threads well in most cases,
+     * which means that threads mostly reduce their own data which increases
+     * the number of cache hits.
+     * Additionally, we should always use the same number of threads in parallel
+     * regions in OpenMP, otherwise the performance will degrade significantly.
+     */
+    const int gmx_unused numThreadsForReduction = threadForceBuffers.size();
+#pragma omp parallel for num_threads(numThreadsForReduction) schedule(static)
+    for (int b = 0; b < usedBlockIndices.ssize(); b++)
+    {
+        try
+        {
+            // Reduce the buffers that contribute to this block
+            ForceBufferElementType* fp[s_maxNumThreadsForReduction];
+
+            const int blockIndex = usedBlockIndices[b];
+
+            // Make a list of threads that have this block index set in the mask
+            int numContributingBuffers = 0;
+            for (int ft = 0; ft < numBuffers; ft++)
+            {
+                if (bitmask_is_set(masks[blockIndex], ft))
+                {
+                    fp[numContributingBuffers++] = threadForceBuffers[ft]->forceBuffer();
+                }
+            }
+            if (numContributingBuffers > 0)
+            {
+                // Reduce the selected buffers
+                int a0 = blockIndex * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize;
+                int a1 = (blockIndex + 1) * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize;
+                // Note: It would be nice if we could pad f to avoid this min()
+                a1 = std::min(a1, numAtoms);
+                if (numContributingBuffers == 1)
+                {
+                    // Avoid double loop for the case of a single buffer
+                    for (int a = a0; a < a1; a++)
+                    {
+                        rvec_inc(f[a], fp[0][a]);
+                    }
+                }
+                else
+                {
+                    for (int a = a0; a < a1; a++)
+                    {
+                        for (int fb = 0; fb < numContributingBuffers; fb++)
+                        {
+                            rvec_inc(f[a], fp[fb][a]);
+                        }
+                    }
+                }
+            }
+        }
+        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
+    }
+}
+
+} // namespace
+
+template<typename ForceBufferElementType>
+ThreadedForceBuffer<ForceBufferElementType>::ThreadedForceBuffer(const int numThreads, const int numEnergyGroups)
+{
+    threadForceBuffers_.resize(numThreads);
+#pragma omp parallel for num_threads(numThreads) schedule(static)
+    for (int t = 0; t < numThreads; t++)
+    {
+        try
+        {
+            /* Note that thread 0 uses the global fshift and energy arrays,
+             * but to keep the code simple, we initialize all data here.
+             */
+            threadForceBuffers_[t] =
+                    std::make_unique<ThreadForceBuffer<ForceBufferElementType>>(t, numEnergyGroups);
+        }
+        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
+    }
+}
+
+template<typename ForceBufferElementType>
+void ThreadedForceBuffer<ForceBufferElementType>::setupReduction()
+{
+    const int numBuffers = threadForceBuffers_.size();
+
+    const int numAtoms = threadForceBuffers_[0]->size();
+
+    const int totalNumBlocks =
+            (numAtoms + ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize - 1)
+            >> ThreadForceBuffer<ForceBufferElementType>::s_numReductionBlockBits;
+
+    // Check that all thread buffers have matching sizes
+    for (const auto& threadForceBuffer : threadForceBuffers_)
+    {
+        GMX_RELEASE_ASSERT(threadForceBuffer->size() == numAtoms,
+                           "All buffers should have the same size");
+        GMX_RELEASE_ASSERT(threadForceBuffer->reductionMask().ssize() == totalNumBlocks,
+                           "The block count should match");
+    }
+
+    /* Reduce the masks over the threads and determine which blocks
+     * we need to reduce over.
+     */
+    reductionMask_.resize(totalNumBlocks);
+
+    usedBlockIndices_.clear();
+    int numBlocksUsed = 0;
+    for (int b = 0; b < totalNumBlocks; b++)
+    {
+        gmx_bitmask_t& mask = reductionMask_[b];
+
+        /* Generate the union over the threads of the bitmask */
+        bitmask_clear(&mask);
+        for (int t = 0; t < numBuffers; t++)
+        {
+            bitmask_union(&mask, threadForceBuffers_[t]->reductionMask()[b]);
+        }
+        if (!bitmask_is_zero(mask))
+        {
+            usedBlockIndices_.push_back(b);
+        }
+
+        if (debug)
+        {
+            int c = 0;
+            for (int t = 0; t < numBuffers; t++)
+            {
+                if (bitmask_is_set(mask, t))
+                {
+                    c++;
+                }
+            }
+            numBlocksUsed += c;
+
+            if (gmx_debug_at)
+            {
+                fprintf(debug, "block %d flags %s count %d\n", b, to_hex_string(mask).c_str(), c);
+            }
+        }
+    }
+    if (debug)
+    {
+        fprintf(debug,
+                "Number of %d atom blocks to reduce: %d\n",
+                ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize,
+                int(ssize(usedBlockIndices_)));
+        fprintf(debug,
+                "Reduction density %.2f for touched blocks only %.2f\n",
+                numBlocksUsed * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize
+                        / static_cast<double>(numAtoms),
+                numBlocksUsed / static_cast<double>(ssize(usedBlockIndices_)));
+    }
+}
+
+template<typename ForceBufferElementType>
+void ThreadedForceBuffer<ForceBufferElementType>::reduce(gmx::ForceWithShiftForces* forceWithShiftForces,
+                                                         real*                      ener,
+                                                         gmx_grppairener_t*         grpp,
+                                                         gmx::ArrayRef<real>        dvdl,
+                                                         const gmx::StepWorkload& stepWork,
+                                                         const int reductionBeginIndex)
+{
+    if (!usedBlockIndices_.empty())
+    {
+        /* Reduce the bonded force buffer */
+        reduceThreadForceBuffers<ForceBufferElementType>(
+                forceWithShiftForces->force(), threadForceBuffers_, reductionMask_, usedBlockIndices_);
+    }
+
+    rvec* gmx_restrict fshift = as_rvec_array(forceWithShiftForces->shiftForces().data());
+
+    const int numBuffers = numThreadBuffers();
+
+    /* When necessary, reduce energy and virial using one thread only */
+    if ((stepWork.computeEnergy || stepWork.computeVirial || stepWork.computeDhdl)
+        && numBuffers > reductionBeginIndex)
+    {
+        gmx::ArrayRef<const std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> f_t =
+                threadForceBuffers_;
+
+        if (stepWork.computeVirial)
+        {
+            for (int i = 0; i < gmx::c_numShiftVectors; i++)
+            {
+                for (int t = reductionBeginIndex; t < numBuffers; t++)
+                {
+                    rvec_inc(fshift[i], f_t[t]->shiftForces()[i]);
+                }
+            }
+        }
+        if (stepWork.computeEnergy)
+        {
+            for (int i = 0; i < F_NRE; i++)
+            {
+                for (int t = reductionBeginIndex; t < numBuffers; t++)
+                {
+                    ener[i] += f_t[t]->energyTerms()[i];
+                }
+            }
+            for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
+            {
+                for (int j = 0; j < f_t[0]->groupPairEnergies().nener; j++)
+                {
+                    for (int t = reductionBeginIndex; t < numBuffers; t++)
+                    {
+                        grpp->energyGroupPairTerms[i][j] +=
+                                f_t[t]->groupPairEnergies().energyGroupPairTerms[i][j];
+                    }
+                }
+            }
+        }
+        if (stepWork.computeDhdl)
+        {
+            for (auto i : keysOf(f_t[0]->dvdl()))
+            {
+
+                for (int t = reductionBeginIndex; t < numBuffers; t++)
+                {
+                    dvdl[static_cast<int>(i)] += f_t[t]->dvdl()[i];
+                }
+            }
+        }
+    }
+}
+
+template class ThreadForceBuffer<rvec4>;
+template class ThreadedForceBuffer<rvec4>;
+
+} // namespace gmx
diff --git a/src/gromacs/mdtypes/threaded_force_buffer.h b/src/gromacs/mdtypes/threaded_force_buffer.h
new file mode 100644 (file)
index 0000000..de8eca1
--- /dev/null
@@ -0,0 +1,220 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief This file contains the declaration of ThreadForceBuffer and ThreadedForceBuffer.
+ *
+ * These classes provides thread-local force, shift force and energy buffers
+ * for kernels. These kernels can then run completely independently on
+ * multiple threads. Their output can be reduced thread-parallel afterwards.
+ *
+ * Usage:
+ *
+ * At domain decomposition time:
+ * Each thread calls: ThreadForceBuffer.resizeBufferAndClearMask()
+ * Each thread calls: ThreadForceBuffer.addAtomToMask() for all atoms used in the buffer
+ * Each thread calls: ThreadForceBuffer.processMask()
+ * After that ThreadedForceBuffer.setupReduction() is called
+ *
+ * At force computation time:
+ * Each thread calls: ThreadForceBuffer.clearForcesAndEnergies().
+ * Each thread can then accumulate forces and energies into the buffers in ThreadForceBuffer.
+ * After that ThreadedForceBuffer.reduce() is called for thread-parallel reduction.
+ *
+ * \author Berk Hess <hess@kth.se>
+ * \ingroup mdtypes
+ */
+#ifndef GMX_MDTYPES_THREADED_FORCE_BUFFER_H
+#define GMX_MDTYPES_THREADED_FORCE_BUFFER_H
+
+#include <memory>
+
+#include "gromacs/math/vectypes.h"
+#include "gromacs/mdtypes/enerdata.h"
+#include "gromacs/mdtypes/simulation_workload.h"
+#include "gromacs/topology/idef.h"
+#include "gromacs/topology/ifunc.h"
+#include "gromacs/utility/alignedallocator.h"
+#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/bitmask.h"
+#include "gromacs/utility/classhelpers.h"
+
+namespace gmx
+{
+
+class ForceWithShiftForces;
+class StepWorkload;
+
+/*! \internal
+ * \brief Object that holds force and energies buffers plus a mask for a thread
+ *
+ * \tparam ForceBufferElementType  The type for components of the normal force buffer: rvec or rvec4
+ */
+template<typename ForceBufferElementType>
+class ThreadForceBuffer
+{
+public:
+    /* We reduce the force array in blocks of 2^5 atoms. This is large enough
+     * to not cause overhead and 32*sizeof(rvec) is a multiple of the cache-line
+     * size on all systems.
+     */
+    //! The log2 of the reduction block size
+    static constexpr int s_numReductionBlockBits = 5;
+    //! Force buffer block size in atoms
+    static constexpr int s_reductionBlockSize = (1 << s_numReductionBlockBits);
+
+    //! Constructor
+    ThreadForceBuffer(int threadIndex, int numEnergyGroups);
+
+    //! Resizes the buffer to \p numAtoms and clears the mask
+    void resizeBufferAndClearMask(int numAtoms);
+
+    //! Adds atom with index \p atomIndex for reduction
+    void addAtomToMask(const int atomIndex)
+    {
+        bitmask_set_bit(&reductionMask_[atomIndex >> s_numReductionBlockBits], threadIndex_);
+    }
+
+    void processMask();
+
+    //! Returns the size of the force buffer in number of atoms
+    index size() const { return numAtoms_; }
+
+    //! Clears all force and energy buffers
+    void clearForcesAndEnergies();
+
+    //! Returns a plain pointer to the force buffer
+    ForceBufferElementType* forceBuffer()
+    {
+        return reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data());
+    }
+
+    //! Returns a view of the shift force buffer
+    ArrayRef<RVec> shiftForces() { return shiftForces_; }
+
+    //! Returns a view of the energy terms, size F_NRE
+    ArrayRef<real> energyTerms() { return energyTerms_; }
+
+    //! Returns a reference to the energy group pair energies
+    gmx_grppairener_t& groupPairEnergies() { return groupPairEnergies_; }
+
+    //! Returns a reference to the dvdl terms
+    EnumerationArray<FreeEnergyPerturbationCouplingType, real>& dvdl() { return dvdl_; }
+
+    //! Returns a const view to the reduction masks
+    ArrayRef<const gmx_bitmask_t> reductionMask() const { return reductionMask_; }
+
+private:
+    //! Force array buffer
+    std::vector<real, AlignedAllocator<real>> forceBuffer_;
+    //! Mask for marking which parts of f are filled, working array for constructing mask in bonded_threading_t
+    std::vector<gmx_bitmask_t> reductionMask_;
+    //! Index to touched blocks
+    std::vector<int> usedBlockIndices_;
+    //! The index of our thread
+    int threadIndex_;
+    //! The number of atoms in the buffer
+    int numAtoms_ = 0;
+
+    //! Shift force array, size c_numShiftVectors
+    std::vector<RVec> shiftForces_;
+    //! Energy array
+    std::array<real, F_NRE> energyTerms_;
+    //! Group pair energy data for pairs
+    gmx_grppairener_t groupPairEnergies_;
+    //! Free-energy dV/dl output
+    gmx::EnumerationArray<FreeEnergyPerturbationCouplingType, real> dvdl_;
+
+    // Disallow copy and assign, remove this we we get rid of f_
+    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(ThreadForceBuffer);
+};
+
+/*! \internal
+ * \brief Class for accumulating and reducing forces and energies on threads in parallel
+ *
+ * \tparam ForceBufferElementType  The type for components of the normal force buffer: rvec or rvec4
+ */
+template<typename ForceBufferElementType>
+class ThreadedForceBuffer
+{
+public:
+    //! Constructor
+    ThreadedForceBuffer(int numThreads, int numEnergyGroups);
+
+    //! Returns the number of thread buffers
+    int numThreadBuffers() const { return threadForceBuffers_.size(); }
+
+    //! Returns a reference to the buffer object for the thread with index \p bufferIndex
+    ThreadForceBuffer<ForceBufferElementType>& threadForceBuffer(int bufferIndex)
+    {
+        return *threadForceBuffers_[bufferIndex];
+    }
+
+    //! Sets up the reduction, should be called after generating the masks on each thread
+    void setupReduction();
+
+    /*! Reduces forces and energies, as requested by \p stepWork
+     *
+     * The reduction of all output starts at the output from thread \p reductionBeginIndex,
+     * except for the normal force buffer, which always starts at 0.
+     */
+    void reduce(gmx::ForceWithShiftForces* forceWithShiftForces,
+                real*                      ener,
+                gmx_grppairener_t*         grpp,
+                gmx::ArrayRef<real>        dvdl,
+                const gmx::StepWorkload&   stepWork,
+                int                        reductionBeginIndex);
+
+private:
+    //! Force/energy data per thread, size nthreads, stored in unique_ptr to allow thread local allocation
+    std::vector<std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> threadForceBuffers_;
+    //! Indices of blocks that are used, i.e. have force contributions.
+    std::vector<int> usedBlockIndices_;
+    //! Mask array, one element corresponds to a block of reduction_block_size atoms of the force array, bit corresponding to thread indices set if a thread writes to that block
+    std::vector<gmx_bitmask_t> reductionMask_;
+    //! The number of atoms forces are computed for
+    int numAtomsForce_ = 0;
+
+    // Disallow copies to avoid sub-optimal ownership of allocated memory
+    GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(ThreadedForceBuffer);
+};
+
+// Instantiate for rvec4. Can also be instantiated for rvec.
+extern template class ThreadForceBuffer<rvec4>;
+extern template class ThreadedForceBuffer<rvec4>;
+
+} // namespace gmx
+
+#endif