src/gromacs/mdtypes/threaded_force_buffer.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief This file contains the declaration of ThreadForceBuffer and ThreadedForceBuffer.
  38  *
  39  * These classes provides thread-local force, shift force and energy buffers
  40  * for kernels. These kernels can then run completely independently on
  41  * multiple threads. Their output can be reduced thread-parallel afterwards.
  42  *
  43  * Usage:
  44  *
  45  * At domain decomposition time:
  46  * Each thread calls: ThreadForceBuffer.resizeBufferAndClearMask()
  47  * Each thread calls: ThreadForceBuffer.addAtomToMask() for all atoms used in the buffer
  48  * Each thread calls: ThreadForceBuffer.processMask()
  49  * After that ThreadedForceBuffer.setupReduction() is called
  50  *
  51  * At force computation time:
  52  * Each thread calls: ThreadForceBuffer.clearForcesAndEnergies().
  53  * Each thread can then accumulate forces and energies into the buffers in ThreadForceBuffer.
  54  * After that ThreadedForceBuffer.reduce() is called for thread-parallel reduction.
  55  *
  56  * \author Berk Hess <hess@kth.se>
  57  * \ingroup mdtypes
  58  */
  59 #ifndef GMX_MDTYPES_THREADED_FORCE_BUFFER_H
  60 #define GMX_MDTYPES_THREADED_FORCE_BUFFER_H
  61
  62 #include <memory>
  63
  64 #include "gromacs/math/arrayrefwithpadding.h"
  65 #include "gromacs/math/vectypes.h"
  66 #include "gromacs/mdtypes/enerdata.h"
  67 #include "gromacs/mdtypes/simulation_workload.h"
  68 #include "gromacs/topology/ifunc.h"
  69 #include "gromacs/utility/alignedallocator.h"
  70 #include "gromacs/utility/arrayref.h"
  71 #include "gromacs/utility/bitmask.h"
  72 #include "gromacs/utility/classhelpers.h"
  73
  74 namespace gmx
  75 {
  76
  77 class ForceWithShiftForces;
  78 class StepWorkload;
  79
  80 /*! \internal
  81  * \brief Object that holds force and energies buffers plus a mask for a thread
  82  *
  83  * \tparam ForceBufferElementType  The type for components of the normal force buffer: rvec or rvec4
  84  */
  85 template<typename ForceBufferElementType>
  86 class ThreadForceBuffer
  87 {
  88 public:
  89     /* We reduce the force array in blocks of 2^5 atoms. This is large enough
  90      * to not cause overhead and 32*sizeof(rvec) is a multiple of the cache-line
  91      * size on all systems.
  92      */
  93     //! The log2 of the reduction block size
  94     static constexpr int s_numReductionBlockBits = 5;
  95     //! Force buffer block size in atoms
  96     static constexpr int s_reductionBlockSize = (1 << s_numReductionBlockBits);
  97
  98     /*! \brief Constructor
  99      * \param[in] threadIndex  The index of the thread that will fill the buffers in this object
 100      * \param[in] useEnergyTerms   Whether the list of energy terms will be used
 101      * \param[in] numEnergyGroups  The number of non-bonded energy groups
 102      */
 103     ThreadForceBuffer(int threadIndex, bool useEnergyTerms, int numEnergyGroups);
 104
 105     //! Resizes the buffer to \p numAtoms and clears the mask
 106     void resizeBufferAndClearMask(int numAtoms);
 107
 108     //! Adds atom with index \p atomIndex for reduction
 109     void addAtomToMask(const int atomIndex)
 110     {
 111         bitmask_set_bit(&reductionMask_[atomIndex >> s_numReductionBlockBits], threadIndex_);
 112     }
 113
 114     void processMask();
 115
 116     //! Returns the size of the force buffer in number of atoms
 117     index size() const { return numAtoms_; }
 118
 119     //! Clears all force and energy buffers
 120     void clearForcesAndEnergies();
 121
 122     //! Returns an array reference to the force buffer which is aligned for SIMD access
 123     ArrayRef<ForceBufferElementType> forceBuffer()
 124     {
 125         return ArrayRef<ForceBufferElementType>(
 126                 reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()),
 127                 reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()) + numAtoms_);
 128     }
 129
 130     /*! \brief Returns an array reference with padding to the force buffer which is aligned for SIMD access
 131      *
 132      * For RVec there is padding of one real for 4-wide SIMD access.
 133      * For both RVec and rvec4 there is padding up to the block size for use in ThreadedForceBuffer.
 134      */
 135     ArrayRefWithPadding<ForceBufferElementType> forceBufferWithPadding()
 136     {
 137         return ArrayRefWithPadding<ForceBufferElementType>(
 138                 reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()),
 139                 reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()) + numAtoms_,
 140                 reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data() + forceBuffer_.size()));
 141     }
 142
 143     //! Returns a view of the shift force buffer
 144     ArrayRef<RVec> shiftForces() { return shiftForces_; }
 145
 146     //! Returns a view of the energy terms, size F_NRE
 147     ArrayRef<real> energyTerms() { return energyTerms_; }
 148
 149     //! Returns a reference to the energy group pair energies
 150     gmx_grppairener_t& groupPairEnergies() { return groupPairEnergies_; }
 151
 152     //! Returns a reference to the dvdl terms
 153     EnumerationArray<FreeEnergyPerturbationCouplingType, real>& dvdl() { return dvdl_; }
 154
 155     //! Returns a const view to the reduction masks
 156     ArrayRef<const gmx_bitmask_t> reductionMask() const { return reductionMask_; }
 157
 158 private:
 159     //! Force array buffer, aligned to enable aligned SIMD access
 160     std::vector<real, AlignedAllocator<real>> forceBuffer_;
 161     //! Mask for marking which parts of f are filled, working array for constructing mask in setupReduction()
 162     std::vector<gmx_bitmask_t> reductionMask_;
 163     //! Index to touched blocks
 164     std::vector<int> usedBlockIndices_;
 165     //! The index of our thread
 166     int threadIndex_;
 167     //! The number of atoms in the buffer
 168     int numAtoms_ = 0;
 169
 170     //! Shift force array, size c_numShiftVectors
 171     std::vector<RVec> shiftForces_;
 172     //! Energy array, can be empty
 173     std::vector<real> energyTerms_;
 174     //! Group pair energy data for pairs
 175     gmx_grppairener_t groupPairEnergies_;
 176     //! Free-energy dV/dl output
 177     gmx::EnumerationArray<FreeEnergyPerturbationCouplingType, real> dvdl_;
 178
 179     // Disallow copy and assign, remove this we we get rid of f_
 180     GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(ThreadForceBuffer);
 181 };
 182
 183 /*! \internal
 184  * \brief Class for accumulating and reducing forces and energies on threads in parallel
 185  *
 186  * \tparam ForceBufferElementType  The type for components of the normal force buffer: rvec or rvec4
 187  */
 188 template<typename ForceBufferElementType>
 189 class ThreadedForceBuffer
 190 {
 191 public:
 192     /*! \brief Constructor
 193      * \param[in] numThreads       The number of threads that will use the buffers and reduce
 194      * \param[in] useEnergyTerms   Whether the list of energy terms will be used
 195      * \param[in] numEnergyGroups  The number of non-bonded energy groups
 196      */
 197     ThreadedForceBuffer(int numThreads, bool useEnergyTerms, int numEnergyGroups);
 198
 199     //! Returns the number of thread buffers
 200     int numThreadBuffers() const { return threadForceBuffers_.size(); }
 201
 202     //! Returns a reference to the buffer object for the thread with index \p bufferIndex
 203     ThreadForceBuffer<ForceBufferElementType>& threadForceBuffer(int bufferIndex)
 204     {
 205         return *threadForceBuffers_[bufferIndex];
 206     }
 207
 208     //! Sets up the reduction, should be called after generating the masks on each thread
 209     void setupReduction();
 210
 211     /*! \brief Reduces forces and energies, as requested by \p stepWork
 212      *
 213      * The reduction of all output starts at the output from thread \p reductionBeginIndex,
 214      * except for the normal force buffer, which always starts at 0.
 215      *
 216      * Buffers that will not be used as indicated by the flags in \p stepWork
 217      * are allowed to be nullptr or empty.
 218      */
 219     void reduce(gmx::ForceWithShiftForces* forceWithShiftForces,
 220                 real*                      ener,
 221                 gmx_grppairener_t*         grpp,
 222                 gmx::ArrayRef<real>        dvdl,
 223                 const gmx::StepWorkload&   stepWork,
 224                 int                        reductionBeginIndex);
 225
 226 private:
 227     //! Whether the energy buffer is used
 228     bool useEnergyTerms_;
 229     //! Force/energy data per thread, size nthreads, stored in unique_ptr to allow thread local allocation
 230     std::vector<std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> threadForceBuffers_;
 231     //! Indices of blocks that are used, i.e. have force contributions.
 232     std::vector<int> usedBlockIndices_;
 233     //! Mask array, one element corresponds to a block of reduction_block_size atoms of the force array, bit corresponding to thread indices set if a thread writes to that block
 234     std::vector<gmx_bitmask_t> reductionMask_;
 235     //! The number of atoms forces are computed for
 236     int numAtomsForce_ = 0;
 237
 238     // Disallow copies to avoid sub-optimal ownership of allocated memory
 239     GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(ThreadedForceBuffer);
 240 };
 241
 242 // Instantiate for RVec
 243 extern template class ThreadForceBuffer<RVec>;
 244 extern template class ThreadedForceBuffer<RVec>;
 245
 246 // Instantiate for rvec4
 247 extern template class ThreadForceBuffer<rvec4>;
 248 extern template class ThreadedForceBuffer<rvec4>;
 249
 250 } // namespace gmx
 251
 252 #endif