2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * This file defines functions for setting up kernel benchmarks
40 * \author Berk Hess <hess@kth.se>
41 * \ingroup module_nbnxm
46 #include "bench_setup.h"
50 #include "gromacs/gmxlib/nrnb.h"
51 #include "gromacs/math/units.h"
52 #include "gromacs/math/vectypes.h"
53 #include "gromacs/mdlib/dispersioncorrection.h"
54 #include "gromacs/mdlib/force_flags.h"
55 #include "gromacs/mdlib/forcerec.h"
56 #include "gromacs/mdlib/gmx_omp_nthreads.h"
57 #include "gromacs/mdtypes/enerdata.h"
58 #include "gromacs/mdtypes/forcerec.h"
59 #include "gromacs/mdtypes/interaction_const.h"
60 #include "gromacs/mdtypes/mdatom.h"
61 #include "gromacs/mdtypes/simulation_workload.h"
62 #include "gromacs/nbnxm/atomdata.h"
63 #include "gromacs/nbnxm/gridset.h"
64 #include "gromacs/nbnxm/nbnxm.h"
65 #include "gromacs/nbnxm/nbnxm_simd.h"
66 #include "gromacs/nbnxm/pairlistset.h"
67 #include "gromacs/nbnxm/pairlistsets.h"
68 #include "gromacs/nbnxm/pairsearch.h"
69 #include "gromacs/pbcutil/ishift.h"
70 #include "gromacs/pbcutil/pbc.h"
71 #include "gromacs/simd/simd.h"
72 #include "gromacs/timing/cyclecounter.h"
73 #include "gromacs/utility/enumerationhelpers.h"
74 #include "gromacs/utility/fatalerror.h"
75 #include "gromacs/utility/logger.h"
77 #include "bench_system.h"
82 /*! \brief Checks the kernel setup
84 * Returns an error string when the kernel is not available.
86 static std::optional<std::string> checkKernelSetup(const KernelBenchOptions& options)
88 GMX_RELEASE_ASSERT(options.nbnxmSimd < BenchMarkKernels::Count
89 && options.nbnxmSimd != BenchMarkKernels::SimdAuto,
90 "Need a valid kernel SIMD type");
93 if ((options.nbnxmSimd != BenchMarkKernels::SimdNo && !GMX_SIMD)
94 #ifndef GMX_NBNXN_SIMD_4XN
95 || options.nbnxmSimd == BenchMarkKernels::Simd4XM
97 #ifndef GMX_NBNXN_SIMD_2XNN
98 || options.nbnxmSimd == BenchMarkKernels::Simd2XMM
102 return "the requested SIMD kernel was not set up at configuration time";
105 if (options.reportTime && (0 > gmx_cycles_calibrate(1.0)))
107 return "the -time option is not supported on this system";
113 //! Helper to translate between the different enumeration values.
114 static KernelType translateBenchmarkEnum(const BenchMarkKernels& kernel)
116 int kernelInt = static_cast<int>(kernel);
117 return static_cast<KernelType>(kernelInt);
120 /*! \brief Returns the kernel setup
122 static KernelSetup getKernelSetup(const KernelBenchOptions& options)
124 auto messageWhenInvalid = checkKernelSetup(options);
125 GMX_RELEASE_ASSERT(!messageWhenInvalid, "Need valid options");
127 KernelSetup kernelSetup;
129 // The int enum options.nbnxnSimd is set up to match KernelType + 1
130 kernelSetup.kernelType = translateBenchmarkEnum(options.nbnxmSimd);
131 // The plain-C kernel does not support analytical ewald correction
132 if (kernelSetup.kernelType == KernelType::Cpu4x4_PlainC)
134 kernelSetup.ewaldExclusionType = EwaldExclusionType::Table;
138 kernelSetup.ewaldExclusionType = options.useTabulatedEwaldCorr ? EwaldExclusionType::Table
139 : EwaldExclusionType::Analytical;
145 //! Return an interaction constants struct with members used in the benchmark set appropriately
146 static interaction_const_t setupInteractionConst(const KernelBenchOptions& options)
149 interaction_const_t ic;
151 ic.vdwtype = VanDerWaalsType::Cut;
152 ic.vdw_modifier = InteractionModifiers::PotShift;
153 ic.rvdw = options.pairlistCutoff;
155 ic.eeltype = (options.coulombType == BenchMarkCoulomb::Pme ? CoulombInteractionType::Pme
156 : CoulombInteractionType::RF);
157 ic.coulomb_modifier = InteractionModifiers::PotShift;
158 ic.rcoulomb = options.pairlistCutoff;
160 // Reaction-field with reactionFieldPermitivity=inf
161 // TODO: Replace by calc_rffac() after refactoring that
162 ic.reactionFieldCoefficient = 0.5 * std::pow(ic.rcoulomb, -3);
163 ic.reactionFieldShift = 1 / ic.rcoulomb + ic.reactionFieldCoefficient * ic.rcoulomb * ic.rcoulomb;
165 if (EEL_PME_EWALD(ic.eeltype))
167 // Ewald coefficients, we ignore the potential shift
168 GMX_RELEASE_ASSERT(options.ewaldcoeff_q > 0, "Ewald coefficient should be > 0");
169 ic.ewaldcoeff_q = options.ewaldcoeff_q;
170 ic.coulombEwaldTables = std::make_unique<EwaldCorrectionTables>();
171 init_interaction_const_tables(nullptr, &ic, 0, 0);
177 //! Sets up and returns a Nbnxm object for the given benchmark options and system
178 static std::unique_ptr<nonbonded_verlet_t> setupNbnxmForBenchInstance(const KernelBenchOptions& options,
179 const gmx::BenchmarkSystem& system)
181 const auto pinPolicy = (options.useGpu ? gmx::PinningPolicy::PinnedIfSupported
182 : gmx::PinningPolicy::CannotBePinned);
183 const int numThreads = options.numThreads;
184 // Note: the options and Nbnxm combination rule enums values should match
185 const int combinationRule = static_cast<int>(options.ljCombinationRule);
187 auto messageWhenInvalid = checkKernelSetup(options);
188 if (messageWhenInvalid)
190 gmx_fatal(FARGS, "Requested kernel is unavailable because %s.", messageWhenInvalid->c_str());
192 Nbnxm::KernelSetup kernelSetup = getKernelSetup(options);
194 PairlistParams pairlistParams(kernelSetup.kernelType, false, options.pairlistCutoff, false);
197 PbcType::Xyz, false, nullptr, nullptr, pairlistParams.pairlistType, false, numThreads, pinPolicy);
199 auto pairlistSets = std::make_unique<PairlistSets>(pairlistParams, false, 0);
201 auto pairSearch = std::make_unique<PairSearch>(
202 PbcType::Xyz, false, nullptr, nullptr, pairlistParams.pairlistType, false, numThreads, pinPolicy);
204 auto atomData = std::make_unique<nbnxn_atomdata_t>(pinPolicy,
206 kernelSetup.kernelType,
209 system.nonbondedParameters,
213 // Put everything together
214 auto nbv = std::make_unique<nonbonded_verlet_t>(
215 std::move(pairlistSets), std::move(pairSearch), std::move(atomData), kernelSetup, nullptr, nullptr);
219 GMX_RELEASE_ASSERT(!TRICLINIC(system.box), "Only rectangular unit-cells are supported here");
220 const rvec lowerCorner = { 0, 0, 0 };
221 const rvec upperCorner = { system.box[XX][XX], system.box[YY][YY], system.box[ZZ][ZZ] };
223 gmx::ArrayRef<const int> atomInfo;
224 if (options.useHalfLJOptimization)
226 atomInfo = system.atomInfoOxygenVdw;
230 atomInfo = system.atomInfoAllVdw;
233 const real atomDensity = system.coordinates.size() / det(system.box);
235 nbnxn_put_on_grid(nbv.get(),
241 { 0, int(system.coordinates.size()) },
248 nbv->constructPairlist(gmx::InteractionLocality::Local, system.excls, 0, &nrnb);
250 nbv->setAtomProperties(system.atomTypes, system.charges, atomInfo);
255 //! Add the options instance to the list for all requested kernel SIMD types
256 static void expandSimdOptionAndPushBack(const KernelBenchOptions& options,
257 std::vector<KernelBenchOptions>* optionsList)
259 if (options.nbnxmSimd == BenchMarkKernels::SimdAuto)
261 bool addedInstance = false;
262 #ifdef GMX_NBNXN_SIMD_4XN
263 optionsList->push_back(options);
264 optionsList->back().nbnxmSimd = BenchMarkKernels::Simd4XM;
265 addedInstance = true;
267 #ifdef GMX_NBNXN_SIMD_2XNN
268 optionsList->push_back(options);
269 optionsList->back().nbnxmSimd = BenchMarkKernels::Simd2XMM;
270 addedInstance = true;
274 optionsList->push_back(options);
275 optionsList->back().nbnxmSimd = BenchMarkKernels::SimdNo;
280 optionsList->push_back(options);
284 //! Sets up and runs the requested benchmark instance and prints the results
286 // When \p doWarmup is true runs the warmup iterations instead
287 // of the normal ones and does not print any results
288 static void setupAndRunInstance(const gmx::BenchmarkSystem& system,
289 const KernelBenchOptions& options,
292 // Generate an, accurate, estimate of the number of non-zero pair interactions
293 const real atomDensity = system.coordinates.size() / det(system.box);
294 const real numPairsWithinCutoff =
295 atomDensity * 4.0 / 3.0 * M_PI * std::pow(options.pairlistCutoff, 3);
296 const real numUsefulPairs = system.coordinates.size() * 0.5 * (numPairsWithinCutoff + 1);
298 std::unique_ptr<nonbonded_verlet_t> nbv = setupNbnxmForBenchInstance(options, system);
300 // We set the interaction cut-off to the pairlist cut-off
301 interaction_const_t ic = setupInteractionConst(options);
305 gmx_enerdata_t enerd(1, 0);
307 gmx::StepWorkload stepWork;
308 stepWork.computeForces = true;
309 if (options.computeVirialAndEnergy)
311 stepWork.computeVirial = true;
312 stepWork.computeEnergy = true;
315 const gmx::EnumerationArray<BenchMarkKernels, std::string> kernelNames = {
316 "auto", "no", "4xM", "2xMM"
319 const gmx::EnumerationArray<BenchMarkCombRule, std::string> combruleNames = { "geom.",
326 "%-7s %-4s %-5s %-4s ",
327 options.coulombType == BenchMarkCoulomb::Pme ? "Ewald" : "RF",
328 options.useHalfLJOptimization ? "half" : "all",
329 combruleNames[options.ljCombinationRule].c_str(),
330 kernelNames[options.nbnxmSimd].c_str());
331 if (!options.outputFile.empty())
334 "\"%d\",\"%zu\",\"%g\",\"%d\",\"%d\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%"
337 (options.nbnxmSimd != BenchMarkKernels::SimdNo) ? GMX_SIMD_REAL_WIDTH : 0,
341 system.coordinates.size(),
342 options.pairlistCutoff,
344 options.numIterations,
345 options.computeVirialAndEnergy ? "yes" : "no",
346 (options.coulombType != BenchMarkCoulomb::ReactionField)
347 ? ((options.nbnxmSimd == BenchMarkKernels::SimdNo || options.useTabulatedEwaldCorr)
351 options.coulombType == BenchMarkCoulomb::Pme ? "Ewald" : "RF",
352 options.useHalfLJOptimization ? "half" : "all",
353 combruleNames[options.ljCombinationRule].c_str(),
354 kernelNames[options.nbnxmSimd].c_str());
358 // Run pre-iteration to avoid cache misses
359 for (int iter = 0; iter < options.numPreIterations; iter++)
361 nbv->dispatchNonbondedKernel(
362 gmx::InteractionLocality::Local,
366 system.forceRec.shift_vec,
367 enerd.grpp.energyGroupPairTerms[system.forceRec.haveBuckingham ? NonBondedEnergyTerms::BuckinghamSR
368 : NonBondedEnergyTerms::LJSR],
369 enerd.grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR],
373 const int numIterations = (doWarmup ? options.numWarmupIterations : options.numIterations);
374 const PairlistSet& pairlistSet = nbv->pairlistSets().pairlistSet(gmx::InteractionLocality::Local);
375 const gmx::index numPairs = pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_ + pairlistSet.natpair_q_;
376 gmx_cycles_t cycles = gmx_cycles_read();
377 for (int iter = 0; iter < numIterations; iter++)
379 // Run the kernel without force clearing
380 nbv->dispatchNonbondedKernel(
381 gmx::InteractionLocality::Local,
385 system.forceRec.shift_vec,
386 enerd.grpp.energyGroupPairTerms[system.forceRec.haveBuckingham ? NonBondedEnergyTerms::BuckinghamSR
387 : NonBondedEnergyTerms::LJSR],
388 enerd.grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR],
391 cycles = gmx_cycles_read() - cycles;
394 if (options.reportTime)
396 const double uSec = static_cast<double>(cycles) * gmx_cycles_calibrate(1.0) * 1.e6;
397 if (options.cyclesPerPair)
400 "%13.2f %13.3f %10.3f %10.3f\n",
402 uSec / options.numIterations,
403 uSec / (options.numIterations * numPairs),
404 uSec / (options.numIterations * numUsefulPairs));
405 if (!options.outputFile.empty())
408 "\"%.3f\",\"%.4f\",\"%.4f\",\"%.4f\"\n",
410 uSec / options.numIterations,
411 uSec / (options.numIterations * numPairs),
412 uSec / (options.numIterations * numUsefulPairs));
418 "%13.2f %13.3f %10.3f %10.3f\n",
420 uSec / options.numIterations,
421 options.numIterations * numPairs / uSec,
422 options.numIterations * numUsefulPairs / uSec);
423 if (!options.outputFile.empty())
426 "\"%.3f\",\"%.4f\",\"%.4f\",\"%.4f\"\n",
428 uSec / options.numIterations,
429 options.numIterations * numPairs / uSec,
430 options.numIterations * numUsefulPairs / uSec);
436 const double dCycles = static_cast<double>(cycles);
437 if (options.cyclesPerPair)
440 "%10.3f %10.4f %8.4f %8.4f\n",
442 dCycles / options.numIterations * 1e-6,
443 dCycles / (options.numIterations * numPairs),
444 dCycles / (options.numIterations * numUsefulPairs));
449 "%10.3f %10.4f %8.4f %8.4f\n",
451 dCycles / options.numIterations * 1e-6,
452 options.numIterations * numPairs / dCycles,
453 options.numIterations * numUsefulPairs / dCycles);
459 void bench(const int sizeFactor, const KernelBenchOptions& options)
461 // We don't want to call gmx_omp_nthreads_init(), so we init what we need
462 gmx_omp_nthreads_set(ModuleMultiThread::Pairsearch, options.numThreads);
463 gmx_omp_nthreads_set(ModuleMultiThread::Nonbonded, options.numThreads);
465 const gmx::BenchmarkSystem system(sizeFactor, options.outputFile);
467 real minBoxSize = norm(system.box[XX]);
468 for (int dim = YY; dim < DIM; dim++)
470 minBoxSize = std::min(minBoxSize, norm(system.box[dim]));
472 if (options.pairlistCutoff > 0.5 * minBoxSize)
474 gmx_fatal(FARGS, "The cut-off should be shorter than half the box size");
477 std::vector<KernelBenchOptions> optionsList;
480 KernelBenchOptions opt = options;
481 gmx::EnumerationWrapper<BenchMarkCoulomb> coulombIter;
482 for (auto coulombType : coulombIter)
484 opt.coulombType = coulombType;
485 for (int halfLJ = 0; halfLJ <= 1; halfLJ++)
487 opt.useHalfLJOptimization = (halfLJ == 1);
489 gmx::EnumerationWrapper<BenchMarkCombRule> combRuleIter;
490 for (auto combRule : combRuleIter)
492 opt.ljCombinationRule = combRule;
494 expandSimdOptionAndPushBack(opt, &optionsList);
501 expandSimdOptionAndPushBack(options, &optionsList);
503 GMX_RELEASE_ASSERT(!optionsList.empty(), "Expect at least on benchmark setup");
506 if (options.nbnxmSimd != BenchMarkKernels::SimdNo)
508 fprintf(stdout, "SIMD width: %d\n", GMX_SIMD_REAL_WIDTH);
511 fprintf(stdout, "System size: %zu atoms\n", system.coordinates.size());
512 fprintf(stdout, "Cut-off radius: %g nm\n", options.pairlistCutoff);
513 fprintf(stdout, "Number of threads: %d\n", options.numThreads);
514 fprintf(stdout, "Number of iterations: %d\n", options.numIterations);
515 fprintf(stdout, "Compute energies: %s\n", options.computeVirialAndEnergy ? "yes" : "no");
516 if (options.coulombType != BenchMarkCoulomb::ReactionField)
519 "Ewald excl. corr.: %s\n",
520 options.nbnxmSimd == BenchMarkKernels::SimdNo || options.useTabulatedEwaldCorr
526 if (options.numWarmupIterations > 0)
528 setupAndRunInstance(system, optionsList[0], true);
531 if (options.reportTime)
534 "Coulomb LJ comb. SIMD usec usec/it. %s\n",
535 options.cyclesPerPair ? "usec/pair" : "pairs/usec");
536 if (!options.outputFile.empty())
539 "\"width\",\"atoms\",\"cut-off radius\",\"threads\",\"iter\",\"compute "
540 "energy\",\"Ewald excl. "
541 "corr.\",\"Coulomb\",\"LJ\",\"comb\",\"SIMD\",\"usec\",\"usec/it\",\"total "
542 "pairs/usec\",\"useful pairs/usec\"\n");
550 "Coulomb LJ comb. SIMD Mcycles Mcycles/it. %s\n",
551 options.cyclesPerPair ? "cycles/pair" : "pairs/cycle");
552 if (!options.outputFile.empty())
555 "\"width\",\"atoms\",\"cut-off radius\",\"threads\",\"iter\",\"compute "
556 "energy\",\"Ewald excl. "
557 "corr.\",\"Coulomb\",\"LJ\",\"comb\",\"SIMD\",\"Mcycles\",\"Mcycles/"
559 "total cycles/pair\",\"total cycles per useful pair\"\n");
561 fprintf(stdout, " total useful\n");
564 for (const auto& optionsInstance : optionsList)
566 setupAndRunInstance(system, optionsInstance, false);
569 if (!options.outputFile.empty())