2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief This file defines the implementation of ThreadForceBuffer and ThreadedForceBuffer.
39 * \author Berk Hess <hess@kth.se>
41 * \ingroup module_mdtypes
45 #include "threaded_force_buffer.h"
47 #include "gromacs/math/vec.h"
48 #include "gromacs/mdtypes/forceoutput.h"
49 #include "gromacs/pbcutil/ishift.h"
50 #include "gromacs/utility/alignedallocator.h"
51 #include "gromacs/utility/exceptions.h"
52 #include "gromacs/utility/fatalerror.h"
57 /*! \brief The max thread number is arbitrary, we used a fixed number
58 * to avoid memory management. Using more than 16 threads is probably
59 * never useful performance wise. */
60 static constexpr int s_maxNumThreadsForReduction = 256;
62 template<typename ForceBufferElementType>
63 ThreadForceBuffer<ForceBufferElementType>::ThreadForceBuffer(const int threadIndex,
64 const bool useEnergyTerms,
65 const int numEnergyGroups) :
66 threadIndex_(threadIndex), shiftForces_(c_numShiftVectors), groupPairEnergies_(numEnergyGroups)
70 energyTerms_.resize(F_NRE);
74 template<typename ForceBufferElementType>
75 void ThreadForceBuffer<ForceBufferElementType>::clearForcesAndEnergies()
77 constexpr int c_numComponents = sizeof(ForceBufferElementType) / sizeof(real);
79 for (int atomIndex : usedBlockIndices_)
81 const int bufferIndexBegin = atomIndex * s_reductionBlockSize * c_numComponents;
82 const int bufferIndexEnd = bufferIndexBegin + s_reductionBlockSize * c_numComponents;
83 std::fill(forceBuffer_.begin() + bufferIndexBegin, forceBuffer_.begin() + bufferIndexEnd, 0.0_real);
86 const RVec zeroVec = { 0.0_real, 0.0_real, 0.0_real };
87 std::fill(shiftForces_.begin(), shiftForces_.end(), zeroVec);
89 std::fill(energyTerms_.begin(), energyTerms_.end(), 0.0_real);
91 for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
93 for (int j = 0; j < groupPairEnergies_.nener; j++)
95 groupPairEnergies_.energyGroupPairTerms[i][j] = 0.0_real;
98 for (auto i : keysOf(dvdl_))
104 template<typename ForceBufferElementType>
105 void ThreadForceBuffer<ForceBufferElementType>::resizeBufferAndClearMask(const int numAtoms)
107 numAtoms_ = numAtoms;
109 const int numBlocks = (numAtoms + s_reductionBlockSize - 1) >> s_numReductionBlockBits;
111 reductionMask_.resize(numBlocks);
112 forceBuffer_.resize(numBlocks * s_reductionBlockSize * sizeof(ForceBufferElementType) / sizeof(real));
114 for (gmx_bitmask_t& mask : reductionMask_)
116 bitmask_clear(&mask);
120 template<typename ForceBufferElementType>
121 void ThreadForceBuffer<ForceBufferElementType>::processMask()
123 // Now we are done setting the masks, generate the new list of used blocks
124 usedBlockIndices_.clear();
125 for (int b = 0; b < ssize(reductionMask_); b++)
127 if (bitmask_is_set(reductionMask_[b], threadIndex_))
129 usedBlockIndices_.push_back(b);
137 //! \brief Reduce thread-local force buffers into \p force (does not reduce shift forces)
138 template<typename ForceBufferElementType>
139 void reduceThreadForceBuffers(ArrayRef<gmx::RVec> force,
140 ArrayRef<std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> threadForceBuffers,
141 ArrayRef<const gmx_bitmask_t> masks,
142 ArrayRef<const int> usedBlockIndices)
144 const int numBuffers = threadForceBuffers.size();
145 GMX_ASSERT(numBuffers <= s_maxNumThreadsForReduction,
146 "There is a limit on the number of buffers we can use for reduction");
148 const int numAtoms = threadForceBuffers[0]->size();
150 rvec* gmx_restrict f = as_rvec_array(force.data());
152 /* This reduction can run on any number of threads, independently of the number of buffers.
153 * But if the number of threads matches the number of buffers, which it currently does,
154 * the uniform distribution of the touched blocks over nthreads will
155 * match the distribution of bonded over threads well in most cases,
156 * which means that threads mostly reduce their own data which increases
157 * the number of cache hits.
158 * Additionally, we should always use the same number of threads in parallel
159 * regions in OpenMP, otherwise the performance will degrade significantly.
161 const int gmx_unused numThreadsForReduction = threadForceBuffers.size();
162 #pragma omp parallel for num_threads(numThreadsForReduction) schedule(static)
163 for (int b = 0; b < usedBlockIndices.ssize(); b++)
167 // Reduce the buffers that contribute to this block
168 ForceBufferElementType* fp[s_maxNumThreadsForReduction];
170 const int blockIndex = usedBlockIndices[b];
172 // Make a list of threads that have this block index set in the mask
173 int numContributingBuffers = 0;
174 for (int ft = 0; ft < numBuffers; ft++)
176 if (bitmask_is_set(masks[blockIndex], ft))
178 fp[numContributingBuffers++] = threadForceBuffers[ft]->forceBuffer();
181 if (numContributingBuffers > 0)
183 // Reduce the selected buffers
184 int a0 = blockIndex * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize;
185 int a1 = (blockIndex + 1) * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize;
186 // Note: It would be nice if we could pad f to avoid this min()
187 a1 = std::min(a1, numAtoms);
188 if (numContributingBuffers == 1)
190 // Avoid double loop for the case of a single buffer
191 for (int a = a0; a < a1; a++)
193 rvec_inc(f[a], fp[0][a]);
198 for (int a = a0; a < a1; a++)
200 for (int fb = 0; fb < numContributingBuffers; fb++)
202 rvec_inc(f[a], fp[fb][a]);
208 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
214 template<typename ForceBufferElementType>
215 ThreadedForceBuffer<ForceBufferElementType>::ThreadedForceBuffer(const int numThreads,
216 const bool useEnergyTerms,
217 const int numEnergyGroups) :
218 useEnergyTerms_(useEnergyTerms)
220 threadForceBuffers_.resize(numThreads);
221 #pragma omp parallel for num_threads(numThreads) schedule(static)
222 for (int t = 0; t < numThreads; t++)
226 /* Note that thread 0 uses the global fshift and energy arrays,
227 * but to keep the code simple, we initialize all data here.
229 threadForceBuffers_[t] = std::make_unique<ThreadForceBuffer<ForceBufferElementType>>(
230 t, useEnergyTerms_, numEnergyGroups);
232 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
236 template<typename ForceBufferElementType>
237 void ThreadedForceBuffer<ForceBufferElementType>::setupReduction()
239 const int numBuffers = threadForceBuffers_.size();
241 const int numAtoms = threadForceBuffers_[0]->size();
243 const int totalNumBlocks =
244 (numAtoms + ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize - 1)
245 >> ThreadForceBuffer<ForceBufferElementType>::s_numReductionBlockBits;
247 // Check that all thread buffers have matching sizes
248 for (const auto& threadForceBuffer : threadForceBuffers_)
250 GMX_RELEASE_ASSERT(threadForceBuffer->size() == numAtoms,
251 "All buffers should have the same size");
252 GMX_RELEASE_ASSERT(threadForceBuffer->reductionMask().ssize() == totalNumBlocks,
253 "The block count should match");
256 /* Reduce the masks over the threads and determine which blocks
257 * we need to reduce over.
259 reductionMask_.resize(totalNumBlocks);
261 usedBlockIndices_.clear();
262 int numBlocksUsed = 0;
263 for (int b = 0; b < totalNumBlocks; b++)
265 gmx_bitmask_t& mask = reductionMask_[b];
267 /* Generate the union over the threads of the bitmask */
268 bitmask_clear(&mask);
269 for (int t = 0; t < numBuffers; t++)
271 bitmask_union(&mask, threadForceBuffers_[t]->reductionMask()[b]);
273 if (!bitmask_is_zero(mask))
275 usedBlockIndices_.push_back(b);
281 for (int t = 0; t < numBuffers; t++)
283 if (bitmask_is_set(mask, t))
292 fprintf(debug, "block %d flags %s count %d\n", b, to_hex_string(mask).c_str(), c);
299 "Number of %d atom blocks to reduce: %d\n",
300 ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize,
301 int(ssize(usedBlockIndices_)));
303 "Reduction density %.2f for touched blocks only %.2f\n",
304 numBlocksUsed * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize
305 / static_cast<double>(numAtoms),
306 numBlocksUsed / static_cast<double>(ssize(usedBlockIndices_)));
310 template<typename ForceBufferElementType>
311 void ThreadedForceBuffer<ForceBufferElementType>::reduce(gmx::ForceWithShiftForces* forceWithShiftForces,
313 gmx_grppairener_t* grpp,
314 gmx::ArrayRef<real> dvdl,
315 const gmx::StepWorkload& stepWork,
316 const int reductionBeginIndex)
318 if (stepWork.computeForces && !usedBlockIndices_.empty())
320 /* Reduce the force buffer */
321 GMX_ASSERT(forceWithShiftForces, "Need a valid force buffer for reduction");
323 reduceThreadForceBuffers<ForceBufferElementType>(
324 forceWithShiftForces->force(), threadForceBuffers_, reductionMask_, usedBlockIndices_);
327 const int numBuffers = numThreadBuffers();
329 /* When necessary, reduce energy and virial using one thread only */
330 if ((stepWork.computeEnergy || stepWork.computeVirial || stepWork.computeDhdl)
331 && numBuffers > reductionBeginIndex)
333 gmx::ArrayRef<const std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> f_t =
336 if (stepWork.computeVirial)
338 GMX_ASSERT(forceWithShiftForces, "Need a valid force buffer for reduction");
340 rvec* gmx_restrict fshift = as_rvec_array(forceWithShiftForces->shiftForces().data());
342 for (int i = 0; i < gmx::c_numShiftVectors; i++)
344 for (int t = reductionBeginIndex; t < numBuffers; t++)
346 rvec_inc(fshift[i], f_t[t]->shiftForces()[i]);
350 if (stepWork.computeEnergy && useEnergyTerms_)
352 GMX_ASSERT(ener, "Need a valid energy buffer for reduction");
354 for (int i = 0; i < F_NRE; i++)
356 for (int t = reductionBeginIndex; t < numBuffers; t++)
358 ener[i] += f_t[t]->energyTerms()[i];
363 if (stepWork.computeEnergy)
365 GMX_ASSERT(grpp, "Need a valid group pair energy buffer for reduction");
367 for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
369 for (int j = 0; j < f_t[0]->groupPairEnergies().nener; j++)
371 for (int t = reductionBeginIndex; t < numBuffers; t++)
373 grpp->energyGroupPairTerms[i][j] +=
374 f_t[t]->groupPairEnergies().energyGroupPairTerms[i][j];
379 if (stepWork.computeDhdl)
381 GMX_ASSERT(!dvdl.empty(), "Need a valid dV/dl buffer for reduction");
383 for (auto i : keysOf(f_t[0]->dvdl()))
386 for (int t = reductionBeginIndex; t < numBuffers; t++)
388 dvdl[static_cast<int>(i)] += f_t[t]->dvdl()[i];
395 template class ThreadForceBuffer<RVec>;
396 template class ThreadedForceBuffer<RVec>;
398 template class ThreadForceBuffer<rvec4>;
399 template class ThreadedForceBuffer<rvec4>;