src/programs/mdrun/nonbonded_bench.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2019,2020, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35 /*! \internal \file
  36  *
  37  * \brief This file contains the main function for the non-bonded kernel benchmark
  38  *
  39  * \author Berk Hess <hess@kth.se>
  40  */
  41
  42 #include "gmxpre.h"
  43
  44 #include "nonbonded_bench.h"
  45
  46 #include <vector>
  47
  48 #include "gromacs/commandline/cmdlineoptionsmodule.h"
  49 #include "gromacs/ewald/ewald_utils.h"
  50 #include "gromacs/nbnxm/benchmark/bench_setup.h"
  51 #include "gromacs/options/basicoptions.h"
  52 #include "gromacs/options/filenameoption.h"
  53 #include "gromacs/options/ioptionscontainer.h"
  54 #include "gromacs/selection/selectionoptionbehavior.h"
  55 #include "gromacs/utility/arraysize.h"
  56 #include "gromacs/utility/enumerationhelpers.h"
  57
  58 namespace gmx
  59 {
  60
  61
  62 namespace
  63 {
  64
  65 class NonbondedBenchmark : public ICommandLineOptionsModule
  66 {
  67 public:
  68     NonbondedBenchmark() {}
  69
  70     // From ICommandLineOptionsModule
  71     void init(CommandLineModuleSettings* /*settings*/) override {}
  72     void initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings) override;
  73     void optionsFinished() override;
  74     int  run() override;
  75
  76 private:
  77     int                       sizeFactor_ = 1;
  78     Nbnxm::KernelBenchOptions benchmarkOptions_;
  79 };
  80
  81 void NonbondedBenchmark::initOptions(IOptionsContainer* options, ICommandLineOptionsModuleSettings* settings)
  82 {
  83     std::vector<const char*> desc = {
  84         "[THISMODULE] runs benchmarks for one or more so-called Nbnxm",
  85         "non-bonded pair kernels. The non-bonded pair kernels are",
  86         "the most compute intensive part of MD simulations",
  87         "and usually comprise 60 to 90 percent of the runtime.",
  88         "For this reason they are highly optimized and several different",
  89         "setups are available to compute the same physical interactions.",
  90         "In addition, there are different physical treatments of Coulomb",
  91         "interactions and optimizations for atoms without Lennard-Jones",
  92         "interactions. There are also different physical treatments of",
  93         "Lennard-Jones interactions, but only a plain cut-off is supported",
  94         "in this tool, as that is by far the most common treatment.",
  95         "And finally, while force output is always necessary, energy output",
  96         "is only required at certain steps. In total there are",
  97         "12 relevant combinations of options. The combinations double to 24",
  98         "when two different SIMD setups are supported. These combinations",
  99         "can be run with a single invocation using the [TT]-all[tt] option.",
 100         "The behavior of each kernel is affected by caching behavior,",
 101         "which is determined by the hardware used together with the system size",
 102         "and the cut-off radius. The larger the number of atoms per thread,",
 103         "the more L1 cache is needed to avoid L1 cache misses.",
 104         "The cut-off radius mainly affects the data reuse: a larger cut-off",
 105         "results in more data reuse and makes the kernel less sensitive to cache",
 106         "misses.[PAR]",
 107         "OpenMP parallelization is used to utilize multiple hardware threads",
 108         "within a compute node. In these benchmarks there is no interaction",
 109         "between threads, apart from starting and closing a single OpenMP",
 110         "parallel region per iteration. Additionally, threads interact",
 111         "through sharing and evicting data from shared caches.",
 112         "The number of threads to use is set with the [TT]-nt[tt] option.",
 113         "Thread affinity is important, especially with SMT and shared",
 114         "caches. Affinities can be set through the OpenMP library using",
 115         "the GOMP_CPU_AFFINITY environment variable.[PAR]",
 116         "The benchmark tool times one or more kernels by running them",
 117         "repeatedly for a number of iterations set by the [TT]-iter[tt]",
 118         "option. An initial kernel call is done to avoid additional initial",
 119         "cache misses. Times are recording in cycles read from efficient,",
 120         "high accuracy counters in the CPU. Note that these often do not",
 121         "correspond to actual clock cycles. For each kernel, the tool",
 122         "reports the total number of cycles, cycles per iteration,",
 123         "and (total and useful) pair interactions per cycle.",
 124         "Because a cluster pair list is used instead of an atom pair list,",
 125         "interactions are also computed for some atom pairs that are beyond",
 126         "the cut-off distance. These pairs are not useful (except for",
 127         "additional buffering, but that is not of interest here),",
 128         "only a side effect of the cluster-pair setup. The SIMD 2xMM kernel",
 129         "has a higher useful pair ratio then the 4xM kernel due to a smaller",
 130         "cluster size, but a lower total pair throughput.",
 131         "It is best to run this, or for that matter any, benchmark",
 132         "with locked CPU clocks, as thermal throttling can significantly",
 133         "affect performance. If that is not an option, the [TT]-warmup[TT]",
 134         "option can be used to run initial, untimed iterations to warm up",
 135         "the processor.[PAR]",
 136         "The most relevant regime is between 0.1 to 1 millisecond per",
 137         "iteration. Thus it is useful to run with system sizes that cover",
 138         "both ends of this regime.[PAR]",
 139         "The [TT]-simd[tt] and [TT]-table[tt] options select different",
 140         "implementations to compute the same physics. The choice of these",
 141         "options should ideally be optimized for the target hardware.",
 142         "Historically, we only found tabulated Ewald correction to be useful",
 143         "on 2-wide SIMD or 4-wide SIMD without FMA support. As all modern",
 144         "architectures are wider and support FMA, we do not use tables by",
 145         "default. The only exceptions are kernels without SIMD, which only",
 146         "support tables.",
 147         "Options [TT]-coulomb[tt], [TT]-combrule[tt] and [TT]-halflj[tt]",
 148         "depend on the force field and composition of the simulated system.",
 149         "The optimization of computing Lennard-Jones interactions for only",
 150         "half of the atoms in a cluster is useful for water, which does not",
 151         "use Lennard-Jones on hydrogen atoms in most water models.",
 152         "In the MD engine, any clusters where at most half of the atoms",
 153         "have LJ interactions will automatically use this kernel.",
 154         "And finally, the [TT]-energy[tt] option selects the computation",
 155         "of energies, which are usually only needed infrequently."
 156     };
 157
 158     settings->setHelpText(desc);
 159
 160     static const EnumerationArray<Nbnxm::BenchMarkKernels, const char*> c_nbnxmSimdStrings = {
 161         { "auto", "no", "4xm", "2xmm" }
 162     };
 163     static const EnumerationArray<Nbnxm::BenchMarkCombRule, const char*> c_combRuleStrings = {
 164         { "geometric", "lb", "none" }
 165     };
 166     static const EnumerationArray<Nbnxm::BenchMarkCoulomb, const char*> c_coulombTypeStrings = {
 167         { "ewald", "reaction-field" }
 168     };
 169
 170     options->addOption(
 171             IntegerOption("size").store(&sizeFactor_).description("The system size is 3000 atoms times this value"));
 172     options->addOption(
 173             IntegerOption("nt").store(&benchmarkOptions_.numThreads).description("The number of OpenMP threads to use"));
 174     options->addOption(EnumOption<Nbnxm::BenchMarkKernels>("simd")
 175                                .store(&benchmarkOptions_.nbnxmSimd)
 176                                .enumValue(c_nbnxmSimdStrings)
 177                                .description("SIMD type, auto runs all supported SIMD setups or no "
 178                                             "SIMD when SIMD is not supported"));
 179     options->addOption(EnumOption<Nbnxm::BenchMarkCoulomb>("coulomb")
 180                                .store(&benchmarkOptions_.coulombType)
 181                                .enumValue(c_coulombTypeStrings)
 182                                .description("The functional form for the Coulomb interactions"));
 183     options->addOption(
 184             BooleanOption("table")
 185                     .store(&benchmarkOptions_.useTabulatedEwaldCorr)
 186                     .description("Use lookup table for Ewald correction instead of analytical"));
 187     options->addOption(EnumOption<Nbnxm::BenchMarkCombRule>("combrule")
 188                                .store(&benchmarkOptions_.ljCombinationRule)
 189                                .enumValue(c_combRuleStrings)
 190                                .description("The LJ combination rule"));
 191     options->addOption(BooleanOption("halflj")
 192                                .store(&benchmarkOptions_.useHalfLJOptimization)
 193                                .description("Use optimization for LJ on half of the atoms"));
 194     options->addOption(BooleanOption("energy")
 195                                .store(&benchmarkOptions_.computeVirialAndEnergy)
 196                                .description("Compute energies in addition to forces"));
 197     options->addOption(
 198             BooleanOption("all").store(&benchmarkOptions_.doAll).description("Run all 12 combinations of options for coulomb, halflj, combrule"));
 199     options->addOption(RealOption("cutoff")
 200                                .store(&benchmarkOptions_.pairlistCutoff)
 201                                .description("Pair-list and interaction cut-off distance"));
 202     options->addOption(IntegerOption("iter")
 203                                .store(&benchmarkOptions_.numIterations)
 204                                .description("The number of iterations for each kernel"));
 205     options->addOption(IntegerOption("warmup")
 206                                .store(&benchmarkOptions_.numWarmupIterations)
 207                                .description("The number of iterations for initial warmup"));
 208     options->addOption(BooleanOption("cycles")
 209                                .store(&benchmarkOptions_.cyclesPerPair)
 210                                .description("Report cycles/pair instead of pairs/cycle"));
 211 }
 212
 213 void NonbondedBenchmark::optionsFinished()
 214 {
 215     // We compute the Ewald coefficient here to avoid a dependency of the Nbnxm on the Ewald module
 216     const real ewald_rtol          = 1e-5;
 217     benchmarkOptions_.ewaldcoeff_q = calc_ewaldcoeff_q(benchmarkOptions_.pairlistCutoff, ewald_rtol);
 218 }
 219
 220 int NonbondedBenchmark::run()
 221 {
 222     Nbnxm::bench(sizeFactor_, benchmarkOptions_);
 223
 224     return 0;
 225 }
 226
 227 } // namespace
 228
 229 const char NonbondedBenchmarkInfo::name[] = "nonbonded-benchmark";
 230 const char NonbondedBenchmarkInfo::shortDescription[] =
 231         "Benchmarking tool for the non-bonded pair kernels.";
 232
 233 ICommandLineOptionsModulePointer NonbondedBenchmarkInfo::create()
 234 {
 235     return ICommandLineOptionsModulePointer(std::make_unique<NonbondedBenchmark>());
 236 }
 237
 238 } // namespace gmx