2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief This file contains the declaration of ThreadForceBuffer and ThreadedForceBuffer.
39 * These classes provides thread-local force, shift force and energy buffers
40 * for kernels. These kernels can then run completely independently on
41 * multiple threads. Their output can be reduced thread-parallel afterwards.
45 * At domain decomposition time:
46 * Each thread calls: ThreadForceBuffer.resizeBufferAndClearMask()
47 * Each thread calls: ThreadForceBuffer.addAtomToMask() for all atoms used in the buffer
48 * Each thread calls: ThreadForceBuffer.processMask()
49 * After that ThreadedForceBuffer.setupReduction() is called
51 * At force computation time:
52 * Each thread calls: ThreadForceBuffer.clearForcesAndEnergies().
53 * Each thread can then accumulate forces and energies into the buffers in ThreadForceBuffer.
54 * After that ThreadedForceBuffer.reduce() is called for thread-parallel reduction.
56 * \author Berk Hess <hess@kth.se>
59 #ifndef GMX_MDTYPES_THREADED_FORCE_BUFFER_H
60 #define GMX_MDTYPES_THREADED_FORCE_BUFFER_H
64 #include "gromacs/math/arrayrefwithpadding.h"
65 #include "gromacs/math/vectypes.h"
66 #include "gromacs/mdtypes/enerdata.h"
67 #include "gromacs/mdtypes/simulation_workload.h"
68 #include "gromacs/topology/ifunc.h"
69 #include "gromacs/utility/alignedallocator.h"
70 #include "gromacs/utility/arrayref.h"
71 #include "gromacs/utility/bitmask.h"
72 #include "gromacs/utility/classhelpers.h"
77 class ForceWithShiftForces;
81 * \brief Object that holds force and energies buffers plus a mask for a thread
83 * \tparam ForceBufferElementType The type for components of the normal force buffer: rvec or rvec4
85 template<typename ForceBufferElementType>
86 class ThreadForceBuffer
89 /* We reduce the force array in blocks of 2^5 atoms. This is large enough
90 * to not cause overhead and 32*sizeof(rvec) is a multiple of the cache-line
91 * size on all systems.
93 //! The log2 of the reduction block size
94 static constexpr int s_numReductionBlockBits = 5;
95 //! Force buffer block size in atoms
96 static constexpr int s_reductionBlockSize = (1 << s_numReductionBlockBits);
98 /*! \brief Constructor
99 * \param[in] threadIndex The index of the thread that will fill the buffers in this object
100 * \param[in] useEnergyTerms Whether the list of energy terms will be used
101 * \param[in] numEnergyGroups The number of non-bonded energy groups
103 ThreadForceBuffer(int threadIndex, bool useEnergyTerms, int numEnergyGroups);
105 //! Resizes the buffer to \p numAtoms and clears the mask
106 void resizeBufferAndClearMask(int numAtoms);
108 //! Adds atom with index \p atomIndex for reduction
109 void addAtomToMask(const int atomIndex)
111 bitmask_set_bit(&reductionMask_[atomIndex >> s_numReductionBlockBits], threadIndex_);
116 //! Returns the size of the force buffer in number of atoms
117 index size() const { return numAtoms_; }
119 //! Clears all force and energy buffers
120 void clearForcesAndEnergies();
122 //! Returns an array reference to the force buffer which is aligned for SIMD access
123 ArrayRef<ForceBufferElementType> forceBuffer()
125 return ArrayRef<ForceBufferElementType>(
126 reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()),
127 reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()) + numAtoms_);
130 /*! \brief Returns an array reference with padding to the force buffer which is aligned for SIMD access
132 * For RVec there is padding of one real for 4-wide SIMD access.
133 * For both RVec and rvec4 there is padding up to the block size for use in ThreadedForceBuffer.
135 ArrayRefWithPadding<ForceBufferElementType> forceBufferWithPadding()
137 return ArrayRefWithPadding<ForceBufferElementType>(
138 reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()),
139 reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data()) + numAtoms_,
140 reinterpret_cast<ForceBufferElementType*>(forceBuffer_.data() + forceBuffer_.size()));
143 //! Returns a view of the shift force buffer
144 ArrayRef<RVec> shiftForces() { return shiftForces_; }
146 //! Returns a view of the energy terms, size F_NRE
147 ArrayRef<real> energyTerms() { return energyTerms_; }
149 //! Returns a reference to the energy group pair energies
150 gmx_grppairener_t& groupPairEnergies() { return groupPairEnergies_; }
152 //! Returns a reference to the dvdl terms
153 EnumerationArray<FreeEnergyPerturbationCouplingType, real>& dvdl() { return dvdl_; }
155 //! Returns a const view to the reduction masks
156 ArrayRef<const gmx_bitmask_t> reductionMask() const { return reductionMask_; }
159 //! Force array buffer, aligned to enable aligned SIMD access
160 std::vector<real, AlignedAllocator<real>> forceBuffer_;
161 //! Mask for marking which parts of f are filled, working array for constructing mask in setupReduction()
162 std::vector<gmx_bitmask_t> reductionMask_;
163 //! Index to touched blocks
164 std::vector<int> usedBlockIndices_;
165 //! The index of our thread
167 //! The number of atoms in the buffer
170 //! Shift force array, size c_numShiftVectors
171 std::vector<RVec> shiftForces_;
172 //! Energy array, can be empty
173 std::vector<real> energyTerms_;
174 //! Group pair energy data for pairs
175 gmx_grppairener_t groupPairEnergies_;
176 //! Free-energy dV/dl output
177 gmx::EnumerationArray<FreeEnergyPerturbationCouplingType, real> dvdl_;
179 // Disallow copy and assign, remove this we we get rid of f_
180 GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(ThreadForceBuffer);
184 * \brief Class for accumulating and reducing forces and energies on threads in parallel
186 * \tparam ForceBufferElementType The type for components of the normal force buffer: rvec or rvec4
188 template<typename ForceBufferElementType>
189 class ThreadedForceBuffer
192 /*! \brief Constructor
193 * \param[in] numThreads The number of threads that will use the buffers and reduce
194 * \param[in] useEnergyTerms Whether the list of energy terms will be used
195 * \param[in] numEnergyGroups The number of non-bonded energy groups
197 ThreadedForceBuffer(int numThreads, bool useEnergyTerms, int numEnergyGroups);
199 //! Returns the number of thread buffers
200 int numThreadBuffers() const { return threadForceBuffers_.size(); }
202 //! Returns a reference to the buffer object for the thread with index \p bufferIndex
203 ThreadForceBuffer<ForceBufferElementType>& threadForceBuffer(int bufferIndex)
205 return *threadForceBuffers_[bufferIndex];
208 //! Sets up the reduction, should be called after generating the masks on each thread
209 void setupReduction();
211 /*! \brief Reduces forces and energies, as requested by \p stepWork
213 * The reduction of all output starts at the output from thread \p reductionBeginIndex,
214 * except for the normal force buffer, which always starts at 0.
216 * Buffers that will not be used as indicated by the flags in \p stepWork
217 * are allowed to be nullptr or empty.
219 void reduce(gmx::ForceWithShiftForces* forceWithShiftForces,
221 gmx_grppairener_t* grpp,
222 gmx::ArrayRef<real> dvdl,
223 const gmx::StepWorkload& stepWork,
224 int reductionBeginIndex);
227 //! Whether the energy buffer is used
228 bool useEnergyTerms_;
229 //! Force/energy data per thread, size nthreads, stored in unique_ptr to allow thread local allocation
230 std::vector<std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> threadForceBuffers_;
231 //! Indices of blocks that are used, i.e. have force contributions.
232 std::vector<int> usedBlockIndices_;
233 //! Mask array, one element corresponds to a block of reduction_block_size atoms of the force array, bit corresponding to thread indices set if a thread writes to that block
234 std::vector<gmx_bitmask_t> reductionMask_;
235 //! The number of atoms forces are computed for
236 int numAtomsForce_ = 0;
238 // Disallow copies to avoid sub-optimal ownership of allocated memory
239 GMX_DISALLOW_COPY_MOVE_AND_ASSIGN(ThreadedForceBuffer);
242 // Instantiate for RVec
243 extern template class ThreadForceBuffer<RVec>;
244 extern template class ThreadedForceBuffer<RVec>;
246 // Instantiate for rvec4
247 extern template class ThreadForceBuffer<rvec4>;
248 extern template class ThreadedForceBuffer<rvec4>;