src/gromacs/mdtypes/observablesreducer.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \libinternal \file
  36  * \brief
  37  * Declares gmx::ObservablesReducer and builder
  38  *
  39  * Periodically modules implementing MD simulations need to
  40  * communicate with all collaborating ranks to do things like compute
  41  * observables like total energies, signal conditions, and check
  42  * internal consistency. This communication synchronizes all
  43  * participating ranks, which limits scaling and performance, so it is
  44  * done rarely (typically once per step) and only when required.
  45  *
  46  * Modules may provide data of type double to be reduced across
  47  * all ranks via an MPI all-reduce with MPI_SUM. Double-precision
  48  * floating-point is chosen so that no meaningful precision is lost
  49  * e.g. in computing energies, while also permitting integral or
  50  * boolean messages to be passed as double-precision floating-point
  51  * values.
  52  *
  53  * Different modules typically need to communicate on different MD
  54  * steps, so in principle one might optimize by filling a
  55  * std::vector<double> with the values required on the current
  56  * step. However, that requires that each module produce and then copy
  57  * to the reduction buffer the data for this step. The typical amount
  58  * of data required even if all modules need to participate (ie.
  59  * hundreds of doubles) is smaller than the message headers that are
  60  * used by the underlying network transport protocol. So optimizing
  61  * for minimum message size is not particularly effective because it
  62  * does not meaningfully reduce the total time taken to communicate.
  63  *
  64  * Instead, we always reduce a buffer of the size that would be needed
  65  * if all active modules required communication this step. Then no
  66  * module needs to copy data merely to achieve reduction. To achieve
  67  * this, each module needs a stable view of memory into which it can
  68  * store data for which reduction is desired. It also means that
  69  * modules not active in the current simulation do not contribute to
  70  * the workload at run time. Also, modules that are active but don't
  71  * need communication at any particular MD step can passively opt out
  72  * and that incurs no overhead.
  73  *
  74  * The functionality is separated into two main components, one that does
  75  * work during the simulation, and a builder that is used only during
  76  * setup time. This separates the responsibilities of
  77  * - allowing subscription and building the communication buffer, from
  78  * - orchestrating the minimum activity needed for this MD step.
  79  *
  80  * The interaction diagrams for those two workflows are depicted
  81  * below.
  82  *
  83 \msc
  84 wordwraparcs=true,
  85 hscale="2";
  86
  87 runner [label="runner"],
  88 builder [label="builder"],
  89 moduleA [label="moduleA"],
  90 moduleB [label="moduleB"],
  91 observablesReducer [label="observablesReducer"];
  92
  93 runner =>> builder [label="makes"];
  94
  95 runner =>> moduleA [label="makes"];
  96 runner =>> moduleA [label="passes builder to"];
  97 moduleA =>> builder [label="subscribes itself to"];
  98
  99 runner =>> moduleB [label="makes"];
 100 runner =>> moduleB [label="passes builder to"];
 101 moduleB =>> builder [label="subscribes itself to"];
 102
 103 runner =>> builder [label="calls build()"];
 104 builder =>> builder [label="makes communication\nbuffer"];
 105 builder =>> moduleA [label="notifies of\ncallback and view"];
 106 builder =>> moduleB [label="notifies of\ncallback and view"];
 107 builder =>> observablesReducer [label="makes"];
 108
 109 \endmsc
 110
 111 Once the \c observablesReducer is built, the builder may be
 112 destructed.
 113
 114 The \c observablesReducer and its modules operate entirely by
 115 passing callbacks.
 116
 117 \msc
 118 wordwraparcs=true,
 119 hscale="2";
 120
 121 runner [label="runner"],
 122 moduleA [label="moduleA"],
 123 moduleB [label="moduleB"],
 124 observablesReducer [label="observablesReducer"],
 125 compute_globals [label="compute_globals()"];
 126
 127 runner =>> moduleA [label="asks for work"];
 128 moduleA =>> moduleA [label="Produces values\nto reduce"];
 129 moduleA =>> observablesReducer [label="requires reduction from"];
 130
 131 runner =>> moduleB [label="asks for work"];
 132 moduleB =>> moduleB [label="Produces values\nto reduce"];
 133 moduleB =>> observablesReducer [label="requires reduction from"];
 134
 135 runner =>> runner [label="Does other things also"];
 136 runner =>> compute_globals [label="asks to do reduction"];
 137 compute_globals =>> compute_globals [label="prepares data to\nreduce in\nlegacy style"];
 138 compute_globals =>> observablesReducer [label="asks for\nbuffer view"];
 139 observablesReducer =>> compute_globals [label="provides\nbuffer view"];
 140 compute_globals =>> compute_globals [label="Does MPI_Allreduce"];
 141 compute_globals =>> observablesReducer [label="notifies after\nreduction"];
 142 observablesReducer =>> moduleA [label="notifies after reduction"];
 143 moduleA =>> moduleA [label="Uses reduced values"];
 144 moduleA =>> observablesReducer [label="returns"];
 145 observablesReducer =>> moduleB [label="notifies after reduction"];
 146 moduleB =>> moduleB [label="Uses reduced values"];
 147 moduleB =>> observablesReducer [label="returns"];
 148 observablesReducer =>> observablesReducer [label="zeroes reduction buffer"];
 149 observablesReducer =>> compute_globals [label="returns"];
 150
 151 runner =>> observablesReducer [label="notifies at end of step"];
 152
 153
 154 \endmsc
 155  *
 156  * Three callbacks are produced and called per participating module:
 157  *
 158  * 1. One produced by the module and passed to the builder so that
 159  *    later the ObservablesReducer can call it to notify the module
 160  *    that reduction is complete.
 161  * 2. One produced by the builder and returned to the module so the
 162  *    latter can call it to require reduction when it wishes
 163  * 3. One produced by the module and passed to the builder so the
 164  *    latter can call it to notify the former of the buffer view
 165  *    it should use in the first callback and receive a copy
 166  *    of the second callback.
 167  *
 168  * Modules often request that reduction occur "soon" ie. this step or
 169  * next step, depending on whether reduction has already taken place this
 170  * MD step. However they are also able to request reduction to occur
 171  * "eventually" ie. only whenever some other module requires it, so
 172  * the total number of reductions is minimized. Naturally, the
 173  * callback to such a module happens only after the eventual
 174  * reduction, which may happen on the same step or a later one. If a
 175  * module makes more than one "eventually" reduction request before
 176  * reduction takes place, the callback to that module will be called
 177  * multiple times when eventually reduction does take place. It is the
 178  * responsibility of the module to refrain from making those requests
 179  * if the multiple callbacks would be a problem (e.g. maintain an
 180  * internal record of whether a reduction request has been made).
 181  * Modules are not required to set any value for reduction unless they
 182  * are requesting reduction.
 183  *
 184  * An ObservablesReducer object is intended to replace the use of \c
 185  * compute_globals() by simulations, as
 186  * https://gitlab.com/gromacs/gromacs/-/issues/3887 progresses. When
 187  * no modules using the legacy style communication remain, it is
 188  * anticipated that this class will change to contain an MPI
 189  * communicator to use to implement the MPI_Allreduce internally.  At
 190  * that time, communicationBuffer() and reductionComplete() will
 191  * likely change into a doReduction() method, or similar. The flow of
 192  * the whole propagator loop will now be less clear inasmuch as the
 193  * responsibility for requesting reduction now lies with each module,
 194  * however this is probably still more clear than the large forest of
 195  * flags that resulted from all modules having to have their control
 196  * logic in the propagator loop.
 197  *
 198  * \inlibraryapi
 199  * \ingroup module_mdtypes
 200  */
 201 #ifndef GMX_MDTYPES_OBSERVABLESREDUCER_H
 202 #define GMX_MDTYPES_OBSERVABLESREDUCER_H
 203
 204 #include <cstdint>
 205
 206 #include <functional>
 207 #include <memory>
 208 #include <vector>
 209
 210 namespace gmx
 211 {
 212 template<typename>
 213 class ArrayRef; // Forward declaration; this header only names ArrayRef<double> in signatures
 214
 215 class ObservablesReducer; // Forward declaration; it is the return type of ObservablesReducerBuilder::build()
 216 using Step = int64_t; //!< Type of the step value passed to CallbackAfterReduction and reductionComplete()
 217
 218 /*! \brief Control whether reduction is required soon; the argument a subscriber passes to its CallbackToRequireReduction. */
 219 enum class ReductionRequirement : int
 220 {
 221     //! Reduce whenever the runner next checks with the ObservablesReducer.
 222     Soon,
 223     /*! \brief Reduce whenever the runner next checks with the
 224      * ObservablesReducer after some module requires reduction Soon. */
 225     Eventually
 226 };
 227
 228 /*! \brief Report whether the reduction has happened this step; the value returned by a CallbackToRequireReduction. */
 229 enum class ObservablesReducerStatus : int
 230 {
 231     //! Reduction has not yet happened this step
 232     ReadyToReduce,
 233     //! Reduction has happened this step
 234     AlreadyReducedThisStep
 235 };
 236
 237 /*! \libinternal \brief
 238  * Builder for ObservablesReducer
 239  *
 240  * Receives subscriptions from MD modules. Caller should call \c
 241  * build() once all subscriptions have been received, and then not
 242  * attempt any further subscriptions or builds. At that time, the
 243  * builder may be destructed.
 244  *
 245  * This builder will
 246  * - receive all subscriptions from MD modules, then
 247  * - build the communication buffer used by the subscribers,
 248  * - build the \c ObservablesReducer object that manages the
 249  *   lifetime of that buffer, and
 250  * - notify the subscribers via callback of the view of that buffer
 251  *   that is theirs to use and a further callback to require
 252  *   reduction of that buffer.
 253  * See also the interaction diagram in the documentation
 254  * for observablesreducer.h file.
 255  *
 256  * Note that the builder callbacks do not follow the approach of \c
 257  * MDModulesNotifier because that requires that the same value is
 258  * passed to all recipients. Here a distinct value goes to each
 259  * recipient, ie. a different view of the communication buffer.
 260  *
 261  * In order to avoid circular build-time dependencies between the
 262  * ObservablesReducer (and its builder) with the modules that use it,
 263  * the latter can directly call methods on the former, supplying
 264  * anonymous callbacks to be used by the former to contact the
 265  * latter. CallbackAfterReduction and CallbackFromBuilder are of this
 266  * type.
 267  *
 268  * A callback type CallbackToRequireReduction is also used instead of
 269  * a direct method call on ObservablesReducer to require reduction.
 270  * This is implemented by calling a method on the Impl object of an
 271  * ObservablesReducer. This extends the interface of
 272  * ObservablesReducer in a way that is not directly visible. That
 273  * complexity provides two benefits:
 274  * - only registered subscribers can require reduction (which helps
 275  *   ensure correctness by construction)
 276  * - the ObservablesReducer::Impl has a stable address from the heap
 277  *   allocation needed for std::unique_ptr to use in forming the
 278  *   callback to request reduction.
 279  * Alternatives exist for the latter, but create requirements on the
 280  * stability of the address of ObservablesReducer, and/or extra
 281  * coordination to only pass that address to subscribers once it is
 282  * stable.
 283  *
 284  * It is the subscribers' responsibility to coordinate so that all
 285  * subscribers on all ranks agree on the need to communicate, e.g. by
 286  * orchestrating communication based on the current step number or a
 287  * previous message.
 288  *
 289  */
 290 class ObservablesReducerBuilder
 291 {
 292 public:
 293     //! Constructor
 294     ObservablesReducerBuilder();
 295     //! Destructor
 296     ~ObservablesReducerBuilder();
 297     //! Move constructor
 298     ObservablesReducerBuilder(ObservablesReducerBuilder&& other) noexcept;
 299     //! Move assignment operator
 300     ObservablesReducerBuilder& operator=(ObservablesReducerBuilder&& other) noexcept;
 301
 302     /*! \brief Convenience type for the callback subscribers provide so
 303      * they can be notified, with the step value, after reduction is complete. */
 304     using CallbackAfterReduction = std::function<void(Step)>;
 305     /*! \brief Convenience type for the callback subscribers
 306      * call to require reduction.
 307      *
 308      * When called, the status it returns can be used for checking the
 309      * internal expectations of the subscriber on whether reduction
 310      * has already occurred this step, or not. */
 311     using CallbackToRequireReduction = std::function<ObservablesReducerStatus(ReductionRequirement)>;
 312     /*! \brief Convenience type for the callback from the builder to
 313      * notify the subscribers of the callback they will own and later
 314      * use to require reduction and the view of the communication
 315      * buffer they will later use. */
 316     using CallbackFromBuilder = std::function<void(CallbackToRequireReduction&&, ArrayRef<double>)>;
 317
 318     /*! \brief Add a subscriber to the \c ObservablesReducer that will
 319      * later be built in \c build()
 320      *
 321      * Takes ownership of both callbacks supplied by the subscribing
 322      * module. This approach ensures that time is not spent in the MD
 323      * loop constructing std::function objects, because constructing
 324      * one of those requires 1-2 heap allocations (depending on the
 325      * size of the lambda capture).
 326      *
 327      * Must not be called after build() */
 328     void addSubscriber(int                      sizeRequired,
 329                        CallbackFromBuilder&&    callbackFromBuilder,
 330                        CallbackAfterReduction&& callbackAfterReduction);
 331
 332     /*! \brief Build a \c ObservablesReducer to which any subscribers
 333      * have been added
 334      *
 335      * Must be called only once. Notifies each subscriber (via the
 336      * CallbackFromBuilder that it supplied) of the view of the
 337      * reduction buffer that they will use and the
 338      * CallbackToRequireReduction that they will use. */
 339     ObservablesReducer build();
 340
 341 private:
 342     class Impl;
 343     //! Impl object
 344     std::unique_ptr<Impl> impl_;
 345 };
 346
 347 /*! \libinternal \brief
 348  * Manage reduction of observables for registered subscribers
 349  *
 350  * Modules can require that the \c ObservablesReducer object to which
 351  * they have subscribed do communication this step.  After reduction
 352  * is complete, notifications are made to the callbacks that modules
 353  * previously supplied to the ObservablesReducerBuilder. Then the
 354  * reduction buffer is zeroed. Thus the subscribers may not depend on
 355  * the values in their buffer view after the notification callback
 356  * returns, so they should do any necessary processing during that
 357  * callback.
 358  *
 359  * Modules are free to request reduction whenever they wish, and have
 360  * no obligations to do anything at any time. In particular, they
 361  * do not have to set values for their reduction buffer except when
 362  * they are requesting reduction.
 363  *
 364  * The \c ObservablesReducerBuilder object is responsible for
 365  * preparing a vector of doubles and notifying the subscribers of the
 366  * mutually disjoint views of the buffer that they should use for both
 367  * input and output of the reduction. The ObservablesReducer object
 368  * that it builds retains no record of the subscribers, because its
 369  * responsibility is solely to orchestrate the MPI communication and
 370  * callbacks.
 371  *
 372  * Subscribers automatically use the correct \c ObservablesReducer
 373  * object because the callback they received is bound to the correct
 374  * one. The only way a module can participate in an \c
 375  * ObservablesReducer is to have registered with its builder.
 376  *
 377  * The owner of an ObservablesReducer must maintain the lifetime of
 378  * the \c ObservablesReducer object until all subscribers no longer
 379  * need it. After the destruction of an \c ObservablesReducer, if
 380  * subscribers access their view of the communication buffer, the
 381  * behavior is undefined.
 382  *
 383  * \inlibraryapi
 384  * \ingroup module_mdtypes
 385  */
 386 class ObservablesReducer
 387 {
 388 private:
 389     class Impl;
 390     std::unique_ptr<Impl> impl_; //!< Impl object
 391
 392 public:
 393     //! Constructor only usable by ObservablesReducerBuilder
 394     explicit ObservablesReducer(std::unique_ptr<Impl> impl);
 395     //! Destructor
 396     ~ObservablesReducer();
 397     //! Move constructor
 398     ObservablesReducer(ObservablesReducer&& other) noexcept;
 399     //! Move assignment operator
 400     ObservablesReducer& operator=(ObservablesReducer&& other) noexcept;
 401
 402     /*! \brief Provide view of communication buffer for MPI reduction
 403      *
 404      * If no subscriber used ReductionRequirement::Soon since the last
 405      * call to reductionComplete(), then this method returns an empty
 406      * buffer. Otherwise it returns a view over the buffer potentially
 407      * filled by all subscribed modules.
 408      */
 409     ArrayRef<double> communicationBuffer();
 410     /*! \brief Called by the runner after MPI communication is complete
 411      *
 412      * Notifies all subscribers who required reduction since the last
 413      * call to reductionComplete() and passes the \c step value so
 414      * they can check internally that the simulation state is
 415      * consistent.
 416      *
 417      * After all notifications, zeroes the communication buffer. It is
 418      * the responsibility of the subscribers that required reduction
 419      * to react suitably to the data available during their
 420      * notification. This ensures that modules cannot get arbitrary
 421      * but realistic-looking values left behind from previous
 422      * communication stages. It also ensures that subsequent
 423      * communication stages will not be able to keep reducing values
 424      * until they overflow or underflow. This zeroing is most efficient
 425      * to do centrally in an object of this class.
 426      *
 427      * The choice of zero for the sentinel value is not perfect. In
 428      * principle, a value of zero is potentially significant to any
 429      * subscriber, so could be provided to a subscriber as the result
 430      * of an incorrect implementation of ObservablesReducer or
 431      * inconsistent use by subscribers. However by construction (which
 432      * is tested), the integration tests never produce a zero result
 433      * from a reduced value provided by a subscriber. So, if the
 434      * coverage there is high then there is good reason to expect that
 435      * when a zero value is used by a subscriber it is the result of
 436      * a reduction and thus significant, rather than an artefact of
 437      * the zeroing of the communication buffer after notifications are
 438      * complete.
 439      *
 440      * The choice of zero ensures that the MPI reduction will produce
 441      * a valid numerical result in all cases except when a module that
 442      * required reduction set buffer contents that produced a
 443      * problematic output after reduction.
 444      */
 445     void reductionComplete(Step step);
 446     /*! \brief Notify the ObservablesReducer that it should make
 447      * ready to receive new values to reduce
 448      *
 449      * Any runner using the ObservablesReducer must call this method
 450      * whenever a step completes, so that subscribed modules can use
 451      * that information to check whether reduction is happening on the
 452      * step that they expect.
 453      *
 454      * The ObservablesReducer keeps track of whether reduction has
 455      * already occurred this step, so that when modules request
 456      * reduction it can notify them of that status. This permits them
 457      * to check their own requirements, e.g. that
 458      * ReductionRequirement::Soon will operate this step or next
 459      * step.
 460      *
 461      * For the same reason, it is also necessary to call this method
 462      * at a suitable point after uses of an ObservablesReducer before
 463      * the regular steps of a runner. */
 464     void markAsReadyToReduce();
 465     //! The builder needs to be able to make the Impl object
 466     friend class ObservablesReducerBuilder;
 467 };
 468
 469 } // namespace gmx
 470
 471 #endif