/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file defines the implementation of ThreadForceBuffer and ThreadedForceBuffer.
 *
 * \author Berk Hess <hess@kth.se>
 *
 * \ingroup module_mdtypes
 */
#include "gmxpre.h"

#include "threaded_force_buffer.h"

#include "gromacs/math/vec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/alignedallocator.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

namespace gmx
{

/*! \brief The maximum thread count is arbitrary; we use a fixed number
 * to avoid memory management. Using anywhere near this many threads for
 * the reduction is probably never useful performance wise. */
static constexpr int s_maxNumThreadsForReduction = 256;

template<typename ForceBufferElementType>
ThreadForceBuffer<ForceBufferElementType>::ThreadForceBuffer(const int threadIndex,
                                                             const int numEnergyGroups) :
    threadIndex_(threadIndex), shiftForces_(c_numShiftVectors), groupPairEnergies_(numEnergyGroups)
{
}

template<typename ForceBufferElementType>
void ThreadForceBuffer<ForceBufferElementType>::clearForcesAndEnergies()
{
    constexpr int c_numComponents = sizeof(ForceBufferElementType) / sizeof(real);

    for (int blockIndex : usedBlockIndices_)
    {
        const int bufferIndexBegin = blockIndex * s_reductionBlockSize * c_numComponents;
        const int bufferIndexEnd   = bufferIndexBegin + s_reductionBlockSize * c_numComponents;
        std::fill(forceBuffer_.begin() + bufferIndexBegin, forceBuffer_.begin() + bufferIndexEnd, 0.0_real);
    }

    const RVec zeroVec = { 0.0_real, 0.0_real, 0.0_real };
    std::fill(shiftForces_.begin(), shiftForces_.end(), zeroVec);

    std::fill(energyTerms_.begin(), energyTerms_.end(), 0.0_real);

    for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
    {
        for (int j = 0; j < groupPairEnergies_.nener; j++)
        {
            groupPairEnergies_.energyGroupPairTerms[i][j] = 0.0_real;
        }
    }
    for (auto i : keysOf(dvdl_))
    {
        dvdl_[i] = 0;
    }
}
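
/* Layout note, a sketch of the arithmetic above: the flat forceBuffer_
 * stores c_numComponents reals per atom, grouped in blocks of
 * s_reductionBlockSize atoms. If, for example, ForceBufferElementType
 * were rvec4 (4 reals) and the block size were 32 atoms, block b would
 * occupy flat real indices [b*128, (b+1)*128). */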

template<typename ForceBufferElementType>
void ThreadForceBuffer<ForceBufferElementType>::resizeBufferAndClearMask(const int numAtoms)
{
    numAtoms_ = numAtoms;

    const int numBlocks = (numAtoms + s_reductionBlockSize - 1) >> s_numReductionBlockBits;

    reductionMask_.resize(numBlocks);
    forceBuffer_.resize(numBlocks * s_reductionBlockSize * sizeof(ForceBufferElementType) / sizeof(real));

    for (gmx_bitmask_t& mask : reductionMask_)
    {
        bitmask_clear(&mask);
    }
}
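
/* Mask scheme, as a hedged summary assuming the constants defined in the
 * header: atom a belongs to block a >> s_numReductionBlockBits, i.e.
 * a / s_reductionBlockSize. Bit t of reductionMask_[block] is set when
 * thread t scatters forces into that block. So, e.g., with a block size
 * of 32 atoms, a kernel touching atom 100 would set its thread's bit in
 * the mask of block 3. */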

template<typename ForceBufferElementType>
void ThreadForceBuffer<ForceBufferElementType>::processMask()
{
    // Now we are done setting the masks, generate the new list of used blocks
    usedBlockIndices_.clear();
    for (int b = 0; b < ssize(reductionMask_); b++)
    {
        if (bitmask_is_set(reductionMask_[b], threadIndex_))
        {
            usedBlockIndices_.push_back(b);
        }
    }
}

namespace
{

//! \brief Reduce thread-local force buffers into \p force (does not reduce shift forces)
template<typename ForceBufferElementType>
void reduceThreadForceBuffers(ArrayRef<gmx::RVec> force,
                              ArrayRef<std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> threadForceBuffers,
                              ArrayRef<const gmx_bitmask_t> masks,
                              ArrayRef<const int>           usedBlockIndices)
{
    const int numBuffers = threadForceBuffers.size();
    GMX_ASSERT(numBuffers <= s_maxNumThreadsForReduction,
               "There is a limit on the number of buffers we can use for reduction");

    const int numAtoms = threadForceBuffers[0]->size();

    rvec* gmx_restrict f = as_rvec_array(force.data());

    /* This reduction can run on any number of threads, independently of the number of buffers.
     * But when the number of threads matches the number of buffers, which it currently does,
     * the uniform distribution of the touched blocks over the threads will
     * match the distribution of the bonded interactions over the threads well in most cases,
     * which means that threads mostly reduce their own data, which increases
     * the number of cache hits.
     * Additionally, we should always use the same number of threads in parallel
     * regions in OpenMP, otherwise performance will degrade significantly.
     */
    const int gmx_unused numThreadsForReduction = threadForceBuffers.size();
#pragma omp parallel for num_threads(numThreadsForReduction) schedule(static)
    for (int b = 0; b < usedBlockIndices.ssize(); b++)
    {
        try
        {
            // Reduce the buffers that contribute to this block
            ForceBufferElementType* fp[s_maxNumThreadsForReduction];

            const int blockIndex = usedBlockIndices[b];

            // Make a list of threads that have this block index set in the mask
            int numContributingBuffers = 0;
            for (int ft = 0; ft < numBuffers; ft++)
            {
                if (bitmask_is_set(masks[blockIndex], ft))
                {
                    fp[numContributingBuffers++] = threadForceBuffers[ft]->forceBuffer();
                }
            }
            if (numContributingBuffers > 0)
            {
                // Reduce the selected buffers
                int a0 = blockIndex * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize;
                int a1 = (blockIndex + 1) * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize;
                // Note: It would be nice if we could pad f to avoid this min()
                a1 = std::min(a1, numAtoms);
                if (numContributingBuffers == 1)
                {
                    // Avoid double loop for the case of a single buffer
                    for (int a = a0; a < a1; a++)
                    {
                        rvec_inc(f[a], fp[0][a]);
                    }
                }
                else
                {
                    for (int a = a0; a < a1; a++)
                    {
                        for (int fb = 0; fb < numContributingBuffers; fb++)
                        {
                            rvec_inc(f[a], fp[fb][a]);
                        }
                    }
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
    }
}
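
/* Usage sketch (hedged; the producing kernels live elsewhere): before
 * scattering forces into its thread-local buffer, a kernel is expected to
 * flag each touched block in its buffer's mask, roughly:
 *
 *     bitmask_set_bit(&reductionMask[atomIndex >> s_numReductionBlockBits], threadIndex);
 *
 * processMask() then derives the per-thread used-block list and
 * setupReduction() unions the masks across threads, so the parallel loop
 * above only visits blocks that some thread actually touched. */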

} // namespace

template<typename ForceBufferElementType>
ThreadedForceBuffer<ForceBufferElementType>::ThreadedForceBuffer(const int numThreads, const int numEnergyGroups)
{
    threadForceBuffers_.resize(numThreads);
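    /* Allocating each buffer on its own thread in the parallel region below
     * likely improves NUMA locality through first-touch page placement;
     * this rationale is an inference, not stated in the original code. */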
#pragma omp parallel for num_threads(numThreads) schedule(static)
    for (int t = 0; t < numThreads; t++)
    {
        try
        {
            /* Note that thread 0 uses the global fshift and energy arrays,
             * but to keep the code simple, we initialize all data here.
             */
            threadForceBuffers_[t] =
                    std::make_unique<ThreadForceBuffer<ForceBufferElementType>>(t, numEnergyGroups);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
    }
}

template<typename ForceBufferElementType>
void ThreadedForceBuffer<ForceBufferElementType>::setupReduction()
{
    const int numBuffers = threadForceBuffers_.size();

    const int numAtoms = threadForceBuffers_[0]->size();

    const int totalNumBlocks =
            (numAtoms + ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize - 1)
            >> ThreadForceBuffer<ForceBufferElementType>::s_numReductionBlockBits;

    // Check that all thread buffers have matching sizes
    for (const auto& threadForceBuffer : threadForceBuffers_)
    {
        GMX_RELEASE_ASSERT(threadForceBuffer->size() == numAtoms,
                           "All buffers should have the same size");
        GMX_RELEASE_ASSERT(threadForceBuffer->reductionMask().ssize() == totalNumBlocks,
                           "The block count should match");
    }

    /* Reduce the masks over the threads and determine which blocks
     * we need to reduce over.
     */
    reductionMask_.resize(totalNumBlocks);

    usedBlockIndices_.clear();
    // Total count of (block, contributing thread) pairs, for debug statistics only
    int numBlockContributions = 0;
    for (int b = 0; b < totalNumBlocks; b++)
    {
        gmx_bitmask_t& mask = reductionMask_[b];

        /* Generate the union over the threads of the bitmask */
        bitmask_clear(&mask);
        for (int t = 0; t < numBuffers; t++)
        {
            bitmask_union(&mask, threadForceBuffers_[t]->reductionMask()[b]);
        }
        if (!bitmask_is_zero(mask))
        {
            usedBlockIndices_.push_back(b);
        }

        if (debug)
        {
            int c = 0;
            for (int t = 0; t < numBuffers; t++)
            {
                if (bitmask_is_set(mask, t))
                {
                    c++;
                }
            }
            numBlockContributions += c;

            if (gmx_debug_at)
            {
                fprintf(debug, "block %d flags %s count %d\n", b, to_hex_string(mask).c_str(), c);
            }
        }
    }
    if (debug)
    {
        fprintf(debug,
                "Number of %d atom blocks to reduce: %d\n",
                ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize,
                int(ssize(usedBlockIndices_)));
        fprintf(debug,
                "Reduction density %.2f for touched blocks only %.2f\n",
                numBlockContributions * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize
                        / static_cast<double>(numAtoms),
                numBlockContributions / static_cast<double>(ssize(usedBlockIndices_)));
    }
}
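
/* Worked example of the union above: if thread 0 touched blocks {0, 1} and
 * thread 1 touched blocks {1, 2}, the union masks flag blocks 0, 1 and 2,
 * so usedBlockIndices_ = {0, 1, 2}. During reduction, block 1 receives
 * contributions from both thread buffers, while blocks 0 and 2 each come
 * from a single buffer. */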

template<typename ForceBufferElementType>
void ThreadedForceBuffer<ForceBufferElementType>::reduce(gmx::ForceWithShiftForces* forceWithShiftForces,
                                                         real*                      ener,
                                                         gmx_grppairener_t*         grpp,
                                                         gmx::ArrayRef<real>        dvdl,
                                                         const gmx::StepWorkload&   stepWork,
                                                         const int                  reductionBeginIndex)
{
    if (!usedBlockIndices_.empty())
    {
        /* Reduce the bonded force buffer */
        reduceThreadForceBuffers<ForceBufferElementType>(
                forceWithShiftForces->force(), threadForceBuffers_, reductionMask_, usedBlockIndices_);
    }

    rvec* gmx_restrict fshift = as_rvec_array(forceWithShiftForces->shiftForces().data());

    const int numBuffers = numThreadBuffers();

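    /* reductionBeginIndex presumably lets callers skip buffers that already
     * alias the global output; per the note in the constructor, thread 0 can
     * use the global fshift and energy arrays directly, in which case the
     * reduction would start at index 1. */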
    /* When necessary, reduce energy and virial using one thread only */
    if ((stepWork.computeEnergy || stepWork.computeVirial || stepWork.computeDhdl)
        && numBuffers > reductionBeginIndex)
    {
        gmx::ArrayRef<const std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> f_t =
                threadForceBuffers_;

        if (stepWork.computeVirial)
        {
            for (int i = 0; i < gmx::c_numShiftVectors; i++)
            {
                for (int t = reductionBeginIndex; t < numBuffers; t++)
                {
                    rvec_inc(fshift[i], f_t[t]->shiftForces()[i]);
                }
            }
        }
        if (stepWork.computeEnergy)
        {
            for (int i = 0; i < F_NRE; i++)
            {
                for (int t = reductionBeginIndex; t < numBuffers; t++)
                {
                    ener[i] += f_t[t]->energyTerms()[i];
                }
            }
            for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
            {
                for (int j = 0; j < f_t[0]->groupPairEnergies().nener; j++)
                {
                    for (int t = reductionBeginIndex; t < numBuffers; t++)
                    {
                        grpp->energyGroupPairTerms[i][j] +=
                                f_t[t]->groupPairEnergies().energyGroupPairTerms[i][j];
                    }
                }
            }
        }
        if (stepWork.computeDhdl)
        {
            for (auto i : keysOf(f_t[0]->dvdl()))
            {
                for (int t = reductionBeginIndex; t < numBuffers; t++)
                {
                    dvdl[static_cast<int>(i)] += f_t[t]->dvdl()[i];
                }
            }
        }
    }
}
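
/* Typical lifecycle, as a hedged sketch assembled from this file rather than
 * a documented contract:
 *   1. ThreadedForceBuffer(numThreads, numEnergyGroups) creates one
 *      ThreadForceBuffer per thread.
 *   2. Each thread calls resizeBufferAndClearMask(numAtoms), sets the mask
 *      bits for the blocks its kernels will touch, and calls processMask().
 *   3. setupReduction() unions the per-thread masks into the global
 *      used-block list.
 *   4. clearForcesAndEnergies() zeroes the touched data before accumulation,
 *      and reduce() folds all thread buffers into the global outputs. */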

template class ThreadForceBuffer<rvec4>;
template class ThreadedForceBuffer<rvec4>;

} // namespace gmx