src/gromacs/mdtypes/threaded_force_buffer.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file defines the implementation of ThreadForceBuffer and ThreadedForceBuffer.
 *
 * \author Berk Hess <hess@kth.se>
 *
 * \ingroup module_mdtypes
 */
#include "gmxpre.h"

#include "threaded_force_buffer.h"

#include "gromacs/math/vec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/alignedallocator.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

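/* Editor's overview (not part of the upstream sources; a sketch of the intended
 * usage based on the code below): each OpenMP thread owns a ThreadForceBuffer.
 * Before force accumulation the buffer is resized and its reduction mask is
 * cleared, the thread then sets a bit for every block of atoms it will touch,
 * and processMask() / ThreadedForceBuffer::setupReduction() turn those bits into
 * the list of blocks that actually need reducing. After accumulation,
 * ThreadedForceBuffer::reduce() combines forces, shift forces, energies and
 * dV/dlambda contributions into the main output buffers.
 */
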
namespace gmx
{

/*! \brief The maximum number of threads used in the force-buffer reduction.
 *
 * The value is an arbitrary fixed limit, chosen to avoid dynamic memory
 * management; using more threads than this for the reduction is unlikely
 * to ever be useful performance wise. */
static constexpr int s_maxNumThreadsForReduction = 256;

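/* Note (editor's addition): forces are accumulated in blocks of
 * s_reductionBlockSize atoms. For every block there is one gmx_bitmask_t with
 * one bit per thread; a set bit means that the corresponding thread touched
 * atoms in that block, so its buffer has to be included when that block is
 * reduced. This keeps the reduction cost proportional to the data actually
 * written rather than to the full force-buffer size.
 */
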
template<typename ForceBufferElementType>
ThreadForceBuffer<ForceBufferElementType>::ThreadForceBuffer(const int  threadIndex,
                                                             const bool useEnergyTerms,
                                                             const int  numEnergyGroups) :
    threadIndex_(threadIndex), shiftForces_(c_numShiftVectors), groupPairEnergies_(numEnergyGroups)
{
    if (useEnergyTerms)
    {
        energyTerms_.resize(F_NRE);
    }
}

template<typename ForceBufferElementType>
void ThreadForceBuffer<ForceBufferElementType>::clearForcesAndEnergies()
{
    constexpr int c_numComponents = sizeof(ForceBufferElementType) / sizeof(real);

    // Clear only the blocks of the force buffer that this thread actually used
    for (int blockIndex : usedBlockIndices_)
    {
        const int bufferIndexBegin = blockIndex * s_reductionBlockSize * c_numComponents;
        const int bufferIndexEnd   = bufferIndexBegin + s_reductionBlockSize * c_numComponents;
        std::fill(forceBuffer_.begin() + bufferIndexBegin, forceBuffer_.begin() + bufferIndexEnd, 0.0_real);
    }

    const RVec zeroVec = { 0.0_real, 0.0_real, 0.0_real };
    std::fill(shiftForces_.begin(), shiftForces_.end(), zeroVec);

    std::fill(energyTerms_.begin(), energyTerms_.end(), 0.0_real);

    for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
    {
        for (int j = 0; j < groupPairEnergies_.nener; j++)
        {
            groupPairEnergies_.energyGroupPairTerms[i][j] = 0.0_real;
        }
    }
    for (auto i : keysOf(dvdl_))
    {
        dvdl_[i] = 0;
    }
}

template<typename ForceBufferElementType>
void ThreadForceBuffer<ForceBufferElementType>::resizeBufferAndClearMask(const int numAtoms)
{
    numAtoms_ = numAtoms;

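    // Editor's note: the block count is the atom count rounded up to a whole
    // number of reduction blocks; the shift below relies on s_reductionBlockSize
    // being the power of two 1 << s_numReductionBlockBits.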
    const int numBlocks = (numAtoms + s_reductionBlockSize - 1) >> s_numReductionBlockBits;

    reductionMask_.resize(numBlocks);

    constexpr size_t c_numComponentsInElement = sizeof(ForceBufferElementType) / sizeof(real);
    int              newNumElements           = numBlocks * s_reductionBlockSize;
    if (c_numComponentsInElement != 4 && newNumElements == numAtoms)
    {
        // Pad with one element to allow 4-wide SIMD loads and stores.
        // Note that actually only one real is needed, but we need a whole element for the ArrayRef.
        newNumElements += 1;
    }
    forceBuffer_.resize(newNumElements * c_numComponentsInElement);

    for (gmx_bitmask_t& mask : reductionMask_)
    {
        bitmask_clear(&mask);
    }
}

template<typename ForceBufferElementType>
void ThreadForceBuffer<ForceBufferElementType>::processMask()
{
    // Now we are done setting the masks, generate the new list of used blocks
    usedBlockIndices_.clear();
    for (int b = 0; b < ssize(reductionMask_); b++)
    {
        if (bitmask_is_set(reductionMask_[b], threadIndex_))
        {
            usedBlockIndices_.push_back(b);
        }
    }
}

namespace
{

//! \brief Reduce thread-local force buffers into \p force (does not reduce shift forces)
template<typename ForceBufferElementType>
void reduceThreadForceBuffers(ArrayRef<gmx::RVec> force,
                              ArrayRef<std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> threadForceBuffers,
                              ArrayRef<const gmx_bitmask_t> masks,
                              ArrayRef<const int>           usedBlockIndices)
{
    const int numBuffers = threadForceBuffers.size();
    GMX_ASSERT(numBuffers <= s_maxNumThreadsForReduction,
               "There is a limit on the number of buffers we can use for reduction");

    const int numAtoms = threadForceBuffers[0]->size();

    rvec* gmx_restrict f = as_rvec_array(force.data());

    /* This reduction can run on any number of threads, independently of the number of buffers.
     * But if the number of threads matches the number of buffers, which it currently does,
     * the uniform distribution of the touched blocks over the threads will in most
     * cases match the distribution of the bonded interactions over the threads well,
     * which means that threads mostly reduce their own data and the number of
     * cache hits increases.
     * Additionally, we should always use the same number of threads in parallel
     * regions in OpenMP, otherwise the performance will degrade significantly.
     */
    const int gmx_unused numThreadsForReduction = threadForceBuffers.size();
#pragma omp parallel for num_threads(numThreadsForReduction) schedule(static)
    for (int b = 0; b < usedBlockIndices.ssize(); b++)
    {
        try
        {
            // Reduce the buffers that contribute to this block
            ForceBufferElementType* fp[s_maxNumThreadsForReduction];

            const int blockIndex = usedBlockIndices[b];

            // Make a list of threads that have this block index set in the mask
            int numContributingBuffers = 0;
            for (int ft = 0; ft < numBuffers; ft++)
            {
                if (bitmask_is_set(masks[blockIndex], ft))
                {
                    fp[numContributingBuffers++] =
                            threadForceBuffers[ft]->forceBufferWithPadding().paddedArrayRef().data();
                }
            }
            if (numContributingBuffers > 0)
            {
                // Reduce the selected buffers
                int a0 = blockIndex * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize;
                int a1 = (blockIndex + 1) * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize;
                // Note: It would be nice if we could pad f to avoid this min()
                a1 = std::min(a1, numAtoms);
                if (numContributingBuffers == 1)
                {
                    // Avoid the double loop for the case of a single buffer
                    for (int a = a0; a < a1; a++)
                    {
                        rvec_inc(f[a], fp[0][a]);
                    }
                }
                else
                {
                    for (int a = a0; a < a1; a++)
                    {
                        for (int fb = 0; fb < numContributingBuffers; fb++)
                        {
                            rvec_inc(f[a], fp[fb][a]);
                        }
                    }
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
    }
}

} // namespace

template<typename ForceBufferElementType>
ThreadedForceBuffer<ForceBufferElementType>::ThreadedForceBuffer(const int  numThreads,
                                                                 const bool useEnergyTerms,
                                                                 const int  numEnergyGroups) :
    useEnergyTerms_(useEnergyTerms)
{
    threadForceBuffers_.resize(numThreads);
#pragma omp parallel for num_threads(numThreads) schedule(static)
    for (int t = 0; t < numThreads; t++)
    {
        try
        {
            /* Note that thread 0 uses the global fshift and energy arrays,
             * but to keep the code simple, we initialize all data here.
             */
            threadForceBuffers_[t] = std::make_unique<ThreadForceBuffer<ForceBufferElementType>>(
                    t, useEnergyTerms_, numEnergyGroups);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
    }
}

template<typename ForceBufferElementType>
void ThreadedForceBuffer<ForceBufferElementType>::setupReduction()
{
    const int numBuffers = threadForceBuffers_.size();

    const int numAtoms = threadForceBuffers_[0]->size();

    const int totalNumBlocks =
            (numAtoms + ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize - 1)
            >> ThreadForceBuffer<ForceBufferElementType>::s_numReductionBlockBits;

    // Check that all thread buffers have matching sizes
    for (const auto& threadForceBuffer : threadForceBuffers_)
    {
        GMX_RELEASE_ASSERT(threadForceBuffer->size() == numAtoms,
                           "All buffers should have the same size");
        GMX_RELEASE_ASSERT(threadForceBuffer->reductionMask().ssize() == totalNumBlocks,
                           "The block count should match");
    }

    /* Reduce the masks over the threads and determine which blocks
     * we need to reduce over.
     */
    reductionMask_.resize(totalNumBlocks);

    usedBlockIndices_.clear();
    int numBlocksUsed = 0;
    for (int b = 0; b < totalNumBlocks; b++)
    {
        gmx_bitmask_t& mask = reductionMask_[b];

        /* Generate the union over the threads of the bitmask */
        bitmask_clear(&mask);
        for (int t = 0; t < numBuffers; t++)
        {
            bitmask_union(&mask, threadForceBuffers_[t]->reductionMask()[b]);
        }
        if (!bitmask_is_zero(mask))
        {
            usedBlockIndices_.push_back(b);
        }

        if (debug)
        {
            int c = 0;
            for (int t = 0; t < numBuffers; t++)
            {
                if (bitmask_is_set(mask, t))
                {
                    c++;
                }
            }
            numBlocksUsed += c;

            if (gmx_debug_at)
            {
                fprintf(debug, "block %d flags %s count %d\n", b, to_hex_string(mask).c_str(), c);
            }
        }
    }
    if (debug)
    {
        fprintf(debug,
                "Number of %d atom blocks to reduce: %d\n",
                ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize,
                int(ssize(usedBlockIndices_)));
        fprintf(debug,
                "Reduction density %.2f for touched blocks only %.2f\n",
                numBlocksUsed * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize
                        / static_cast<double>(numAtoms),
                numBlocksUsed / static_cast<double>(ssize(usedBlockIndices_)));
    }
}

template<typename ForceBufferElementType>
void ThreadedForceBuffer<ForceBufferElementType>::reduce(gmx::ForceWithShiftForces* forceWithShiftForces,
                                                         real*                      ener,
                                                         gmx_grppairener_t*         grpp,
                                                         gmx::ArrayRef<real>        dvdl,
                                                         const gmx::StepWorkload&   stepWork,
                                                         const int                  reductionBeginIndex)
{
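    /* Editor's note: thread buffers with an index below reductionBeginIndex are
     * skipped in the shift-force, energy and dV/dlambda reductions below;
     * typically the first buffer already accumulates directly into the main
     * output arrays (see the constructor comment about thread 0 using the
     * global fshift and energy arrays). */
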
    if (stepWork.computeForces && !usedBlockIndices_.empty())
    {
        /* Reduce the force buffer */
        GMX_ASSERT(forceWithShiftForces, "Need a valid force buffer for reduction");

        reduceThreadForceBuffers<ForceBufferElementType>(
                forceWithShiftForces->force(), threadForceBuffers_, reductionMask_, usedBlockIndices_);
    }

    const int numBuffers = numThreadBuffers();

    /* When necessary, reduce energy and virial using one thread only */
    if ((stepWork.computeEnergy || stepWork.computeVirial || stepWork.computeDhdl)
        && numBuffers > reductionBeginIndex)
    {
        gmx::ArrayRef<const std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> f_t =
                threadForceBuffers_;

        if (stepWork.computeVirial)
        {
            GMX_ASSERT(forceWithShiftForces, "Need a valid force buffer for reduction");

            rvec* gmx_restrict fshift = as_rvec_array(forceWithShiftForces->shiftForces().data());

            for (int i = 0; i < gmx::c_numShiftVectors; i++)
            {
                for (int t = reductionBeginIndex; t < numBuffers; t++)
                {
                    rvec_inc(fshift[i], f_t[t]->shiftForces()[i]);
                }
            }
        }
        if (stepWork.computeEnergy && useEnergyTerms_)
        {
            GMX_ASSERT(ener, "Need a valid energy buffer for reduction");

            for (int i = 0; i < F_NRE; i++)
            {
                for (int t = reductionBeginIndex; t < numBuffers; t++)
                {
                    ener[i] += f_t[t]->energyTerms()[i];
                }
            }
        }

        if (stepWork.computeEnergy)
        {
            GMX_ASSERT(grpp, "Need a valid group pair energy buffer for reduction");

            for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
            {
                for (int j = 0; j < f_t[0]->groupPairEnergies().nener; j++)
                {
                    for (int t = reductionBeginIndex; t < numBuffers; t++)
                    {
                        grpp->energyGroupPairTerms[i][j] +=
                                f_t[t]->groupPairEnergies().energyGroupPairTerms[i][j];
                    }
                }
            }
        }
        if (stepWork.computeDhdl)
        {
            GMX_ASSERT(!dvdl.empty(), "Need a valid dV/dl buffer for reduction");

            for (auto i : keysOf(f_t[0]->dvdl()))
            {
                for (int t = reductionBeginIndex; t < numBuffers; t++)
                {
                    dvdl[static_cast<int>(i)] += f_t[t]->dvdl()[i];
                }
            }
        }
    }
}

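// Editor's note: these are the two element types currently used for force
// accumulation; rvec4 stores one extra real per atom, which allows aligned
// 4-wide SIMD loads and stores (cf. the padding logic in resizeBufferAndClearMask()).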
template class ThreadForceBuffer<RVec>;
template class ThreadedForceBuffer<RVec>;

template class ThreadForceBuffer<rvec4>;
template class ThreadedForceBuffer<rvec4>;

} // namespace gmx