src/gromacs/mdtypes/threaded_force_buffer.cpp
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2021, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
/*! \internal \file
 *
 * \brief This file defines the implementation of ThreadForceBuffer and ThreadedForceBuffer.
 *
 * \author Berk Hess <hess@kth.se>
 *
 * \ingroup module_mdtypes
 */
#include "gmxpre.h"

#include "threaded_force_buffer.h"

#include "gromacs/math/vec.h"
#include "gromacs/mdtypes/forceoutput.h"
#include "gromacs/pbcutil/ishift.h"
#include "gromacs/utility/alignedallocator.h"
#include "gromacs/utility/exceptions.h"
#include "gromacs/utility/fatalerror.h"

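/* Editor's overview (not part of the upstream sources; a sketch of the intended
 * usage based on the code below): each OpenMP thread owns a ThreadForceBuffer.
 * Before force accumulation the buffer is resized and its reduction mask is
 * cleared, the thread then sets a bit for every block of atoms it will touch,
 * and processMask() / ThreadedForceBuffer::setupReduction() turn those bits into
 * the list of blocks that actually need reducing. After accumulation,
 * ThreadedForceBuffer::reduce() combines forces, shift forces, energies and
 * dV/dlambda contributions into the main output buffers.
 */
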
namespace gmx
{

/*! \brief The maximum number of threads used in the force-buffer reduction.
 *
 * The value is an arbitrary fixed limit, chosen to avoid dynamic memory
 * management; using more threads than this for the reduction is unlikely
 * to ever be useful performance wise. */
static constexpr int s_maxNumThreadsForReduction = 256;

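/* Note (editor's addition): forces are accumulated in blocks of
 * s_reductionBlockSize atoms. For every block there is one gmx_bitmask_t with
 * one bit per thread; a set bit means that the corresponding thread touched
 * atoms in that block, so its buffer has to be included when that block is
 * reduced. This keeps the reduction cost proportional to the data actually
 * written rather than to the full force-buffer size.
 */
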
template<typename ForceBufferElementType>
ThreadForceBuffer<ForceBufferElementType>::ThreadForceBuffer(const int  threadIndex,
                                                             const bool useEnergyTerms,
                                                             const int  numEnergyGroups) :
    threadIndex_(threadIndex), shiftForces_(c_numShiftVectors), groupPairEnergies_(numEnergyGroups)
{
    if (useEnergyTerms)
    {
        energyTerms_.resize(F_NRE);
    }
}

template<typename ForceBufferElementType>
void ThreadForceBuffer<ForceBufferElementType>::clearForcesAndEnergies()
{
    constexpr int c_numComponents = sizeof(ForceBufferElementType) / sizeof(real);

    // Clear only the blocks of the force buffer that this thread actually used
    for (int blockIndex : usedBlockIndices_)
    {
        const int bufferIndexBegin = blockIndex * s_reductionBlockSize * c_numComponents;
        const int bufferIndexEnd   = bufferIndexBegin + s_reductionBlockSize * c_numComponents;
        std::fill(forceBuffer_.begin() + bufferIndexBegin, forceBuffer_.begin() + bufferIndexEnd, 0.0_real);
    }

    const RVec zeroVec = { 0.0_real, 0.0_real, 0.0_real };
    std::fill(shiftForces_.begin(), shiftForces_.end(), zeroVec);

    std::fill(energyTerms_.begin(), energyTerms_.end(), 0.0_real);

    for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
    {
        for (int j = 0; j < groupPairEnergies_.nener; j++)
        {
            groupPairEnergies_.energyGroupPairTerms[i][j] = 0.0_real;
        }
    }
    for (auto i : keysOf(dvdl_))
    {
        dvdl_[i] = 0;
    }
}

template<typename ForceBufferElementType>
void ThreadForceBuffer<ForceBufferElementType>::resizeBufferAndClearMask(const int numAtoms)
{
    numAtoms_ = numAtoms;

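    // Editor's note: the block count is the atom count rounded up to a whole
    // number of reduction blocks; the shift below relies on s_reductionBlockSize
    // being the power of two 1 << s_numReductionBlockBits.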
    const int numBlocks = (numAtoms + s_reductionBlockSize - 1) >> s_numReductionBlockBits;

    reductionMask_.resize(numBlocks);

    constexpr size_t c_numComponentsInElement = sizeof(ForceBufferElementType) / sizeof(real);
    int              newNumElements           = numBlocks * s_reductionBlockSize;
    if (c_numComponentsInElement != 4 && newNumElements == numAtoms)
    {
        // Pad with one element to allow 4-wide SIMD loads and stores.
        // Note that actually only one real is needed, but we need a whole element for the ArrayRef.
        newNumElements += 1;
    }
    forceBuffer_.resize(newNumElements * c_numComponentsInElement);

    for (gmx_bitmask_t& mask : reductionMask_)
    {
        bitmask_clear(&mask);
    }
}

template<typename ForceBufferElementType>
void ThreadForceBuffer<ForceBufferElementType>::processMask()
{
    // Now we are done setting the masks, generate the new list of used blocks
    usedBlockIndices_.clear();
    for (int b = 0; b < ssize(reductionMask_); b++)
    {
        if (bitmask_is_set(reductionMask_[b], threadIndex_))
        {
            usedBlockIndices_.push_back(b);
        }
    }
}

namespace
{

//! \brief Reduce thread-local force buffers into \p force (does not reduce shift forces)
template<typename ForceBufferElementType>
void reduceThreadForceBuffers(ArrayRef<gmx::RVec> force,
                              ArrayRef<std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> threadForceBuffers,
                              ArrayRef<const gmx_bitmask_t> masks,
                              ArrayRef<const int>           usedBlockIndices)
{
    const int numBuffers = threadForceBuffers.size();
    GMX_ASSERT(numBuffers <= s_maxNumThreadsForReduction,
               "There is a limit on the number of buffers we can use for reduction");

    const int numAtoms = threadForceBuffers[0]->size();

    rvec* gmx_restrict f = as_rvec_array(force.data());

    /* This reduction can run on any number of threads, independently of the number of buffers.
     * But if the number of threads matches the number of buffers, which it currently does,
     * the uniform distribution of the touched blocks over the threads will in most
     * cases match the distribution of the bonded interactions over the threads well,
     * which means that threads mostly reduce their own data and the number of
     * cache hits increases.
     * Additionally, we should always use the same number of threads in parallel
     * regions in OpenMP, otherwise the performance will degrade significantly.
     */
    const int gmx_unused numThreadsForReduction = threadForceBuffers.size();
#pragma omp parallel for num_threads(numThreadsForReduction) schedule(static)
    for (int b = 0; b < usedBlockIndices.ssize(); b++)
    {
        try
        {
            // Reduce the buffers that contribute to this block
            ForceBufferElementType* fp[s_maxNumThreadsForReduction];

            const int blockIndex = usedBlockIndices[b];

            // Make a list of threads that have this block index set in the mask
            int numContributingBuffers = 0;
            for (int ft = 0; ft < numBuffers; ft++)
            {
                if (bitmask_is_set(masks[blockIndex], ft))
                {
                    fp[numContributingBuffers++] =
                            threadForceBuffers[ft]->forceBufferWithPadding().paddedArrayRef().data();
                }
            }
            if (numContributingBuffers > 0)
            {
                // Reduce the selected buffers
                int a0 = blockIndex * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize;
                int a1 = (blockIndex + 1) * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize;
                // Note: It would be nice if we could pad f to avoid this min()
                a1 = std::min(a1, numAtoms);
                if (numContributingBuffers == 1)
                {
                    // Avoid the double loop for the case of a single buffer
                    for (int a = a0; a < a1; a++)
                    {
                        rvec_inc(f[a], fp[0][a]);
                    }
                }
                else
                {
                    for (int a = a0; a < a1; a++)
                    {
                        for (int fb = 0; fb < numContributingBuffers; fb++)
                        {
                            rvec_inc(f[a], fp[fb][a]);
                        }
                    }
                }
            }
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
    }
}

} // namespace

template<typename ForceBufferElementType>
ThreadedForceBuffer<ForceBufferElementType>::ThreadedForceBuffer(const int  numThreads,
                                                                 const bool useEnergyTerms,
                                                                 const int  numEnergyGroups) :
    useEnergyTerms_(useEnergyTerms)
{
    threadForceBuffers_.resize(numThreads);
#pragma omp parallel for num_threads(numThreads) schedule(static)
    for (int t = 0; t < numThreads; t++)
    {
        try
        {
            /* Note that thread 0 uses the global fshift and energy arrays,
             * but to keep the code simple, we initialize all data here.
             */
            threadForceBuffers_[t] = std::make_unique<ThreadForceBuffer<ForceBufferElementType>>(
                    t, useEnergyTerms_, numEnergyGroups);
        }
        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
    }
}

template<typename ForceBufferElementType>
void ThreadedForceBuffer<ForceBufferElementType>::setupReduction()
{
    const int numBuffers = threadForceBuffers_.size();

    const int numAtoms = threadForceBuffers_[0]->size();

    const int totalNumBlocks =
            (numAtoms + ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize - 1)
            >> ThreadForceBuffer<ForceBufferElementType>::s_numReductionBlockBits;

    // Check that all thread buffers have matching sizes
    for (const auto& threadForceBuffer : threadForceBuffers_)
    {
        GMX_RELEASE_ASSERT(threadForceBuffer->size() == numAtoms,
                           "All buffers should have the same size");
        GMX_RELEASE_ASSERT(threadForceBuffer->reductionMask().ssize() == totalNumBlocks,
                           "The block count should match");
    }

    /* Reduce the masks over the threads and determine which blocks
     * we need to reduce over.
     */
    reductionMask_.resize(totalNumBlocks);

    usedBlockIndices_.clear();
    int numBlocksUsed = 0;
    for (int b = 0; b < totalNumBlocks; b++)
    {
        gmx_bitmask_t& mask = reductionMask_[b];

        /* Generate the union over the threads of the bitmask */
        bitmask_clear(&mask);
        for (int t = 0; t < numBuffers; t++)
        {
            bitmask_union(&mask, threadForceBuffers_[t]->reductionMask()[b]);
        }
        if (!bitmask_is_zero(mask))
        {
            usedBlockIndices_.push_back(b);
        }

        if (debug)
        {
            int c = 0;
            for (int t = 0; t < numBuffers; t++)
            {
                if (bitmask_is_set(mask, t))
                {
                    c++;
                }
            }
            numBlocksUsed += c;

            if (gmx_debug_at)
            {
                fprintf(debug, "block %d flags %s count %d\n", b, to_hex_string(mask).c_str(), c);
            }
        }
    }
    if (debug)
    {
        fprintf(debug,
                "Number of %d atom blocks to reduce: %d\n",
                ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize,
                int(ssize(usedBlockIndices_)));
        fprintf(debug,
                "Reduction density %.2f for touched blocks only %.2f\n",
                numBlocksUsed * ThreadForceBuffer<ForceBufferElementType>::s_reductionBlockSize
                        / static_cast<double>(numAtoms),
                numBlocksUsed / static_cast<double>(ssize(usedBlockIndices_)));
    }
}

template<typename ForceBufferElementType>
void ThreadedForceBuffer<ForceBufferElementType>::reduce(gmx::ForceWithShiftForces* forceWithShiftForces,
                                                         real*                      ener,
                                                         gmx_grppairener_t*         grpp,
                                                         gmx::ArrayRef<real>        dvdl,
                                                         const gmx::StepWorkload&   stepWork,
                                                         const int                  reductionBeginIndex)
{
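    /* Editor's note: thread buffers with an index below reductionBeginIndex are
     * skipped in the shift-force, energy and dV/dlambda reductions below;
     * typically the first buffer already accumulates directly into the main
     * output arrays (see the constructor comment about thread 0 using the
     * global fshift and energy arrays). */
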
    if (stepWork.computeForces && !usedBlockIndices_.empty())
    {
        /* Reduce the force buffer */
        GMX_ASSERT(forceWithShiftForces, "Need a valid force buffer for reduction");

        reduceThreadForceBuffers<ForceBufferElementType>(
                forceWithShiftForces->force(), threadForceBuffers_, reductionMask_, usedBlockIndices_);
    }

    const int numBuffers = numThreadBuffers();

    /* When necessary, reduce energy and virial using one thread only */
    if ((stepWork.computeEnergy || stepWork.computeVirial || stepWork.computeDhdl)
        && numBuffers > reductionBeginIndex)
    {
        gmx::ArrayRef<const std::unique_ptr<ThreadForceBuffer<ForceBufferElementType>>> f_t =
                threadForceBuffers_;

        if (stepWork.computeVirial)
        {
            GMX_ASSERT(forceWithShiftForces, "Need a valid force buffer for reduction");

            rvec* gmx_restrict fshift = as_rvec_array(forceWithShiftForces->shiftForces().data());

            for (int i = 0; i < gmx::c_numShiftVectors; i++)
            {
                for (int t = reductionBeginIndex; t < numBuffers; t++)
                {
                    rvec_inc(fshift[i], f_t[t]->shiftForces()[i]);
                }
            }
        }
        if (stepWork.computeEnergy && useEnergyTerms_)
        {
            GMX_ASSERT(ener, "Need a valid energy buffer for reduction");

            for (int i = 0; i < F_NRE; i++)
            {
                for (int t = reductionBeginIndex; t < numBuffers; t++)
                {
                    ener[i] += f_t[t]->energyTerms()[i];
                }
            }
        }

        if (stepWork.computeEnergy)
        {
            GMX_ASSERT(grpp, "Need a valid group pair energy buffer for reduction");

            for (int i = 0; i < static_cast<int>(NonBondedEnergyTerms::Count); i++)
            {
                for (int j = 0; j < f_t[0]->groupPairEnergies().nener; j++)
                {
                    for (int t = reductionBeginIndex; t < numBuffers; t++)
                    {
                        grpp->energyGroupPairTerms[i][j] +=
                                f_t[t]->groupPairEnergies().energyGroupPairTerms[i][j];
                    }
                }
            }
        }
        if (stepWork.computeDhdl)
        {
            GMX_ASSERT(!dvdl.empty(), "Need a valid dV/dl buffer for reduction");

            for (auto i : keysOf(f_t[0]->dvdl()))
            {
                for (int t = reductionBeginIndex; t < numBuffers; t++)
                {
                    dvdl[static_cast<int>(i)] += f_t[t]->dvdl()[i];
                }
            }
        }
    }
}

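// Editor's note: these are the two element types currently used for force
// accumulation; rvec4 stores one extra real per atom, which allows aligned
// 4-wide SIMD loads and stores (cf. the padding logic in resizeBufferAndClearMask()).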
template class ThreadForceBuffer<RVec>;
template class ThreadedForceBuffer<RVec>;

template class ThreadForceBuffer<rvec4>;
template class ThreadedForceBuffer<rvec4>;

} // namespace gmx