timing: try to directly retrieve gmx_cycles_calibrate() from the hardware
author Gilles Gouaillardet <gilles@rist.or.jp>
Tue, 3 Nov 2020 11:23:27 +0000 (11:23 +0000)
committer Paul Bauer <paul.bauer.q@gmail.com>
Tue, 3 Nov 2020 11:23:27 +0000 (11:23 +0000)
The timer period can be directly retrieved from the hardware on ARMv8
and recent x86_64 platforms.
Try this first, and fall back on the current guesstimate on older x86_64 platforms.

src/gromacs/nbnxm/benchmark/bench_setup.cpp
src/gromacs/nbnxm/benchmark/bench_setup.h
src/gromacs/nbnxm/benchmark/bench_system.cpp
src/gromacs/nbnxm/benchmark/bench_system.h
src/gromacs/timing/cyclecounter.cpp
src/programs/mdrun/nonbonded_bench.cpp

index ee4427e9b255438108ffeec00ac835afecd1d395..9fffb257ea21f4f4db102c79b1b4d7e1d225fba8 100644 (file)
@@ -100,6 +100,11 @@ static std::optional<std::string> checkKernelSetup(const KernelBenchOptions& opt
         return "the requested SIMD kernel was not set up at configuration time";
     }
 
+    if (options.reportTime && (0 > gmx_cycles_calibrate(1.0)))
+    {
+        return "the -time option is not supported on this system";
+    }
+
     return {};
 }
 
@@ -304,6 +309,28 @@ static void setupAndRunInstance(const gmx::BenchmarkSystem& system,
                 options.coulombType == BenchMarkCoulomb::Pme ? "Ewald" : "RF",
                 options.useHalfLJOptimization ? "half" : "all",
                 combruleNames[options.ljCombinationRule].c_str(), kernelNames[options.nbnxmSimd].c_str());
+        if (!options.outputFile.empty())
+        {
+            fprintf(system.csv,
+                    "\"%d\",\"%zu\",\"%g\",\"%d\",\"%d\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%"
+                    "s\",",
+#if GMX_SIMD
+                    (options.nbnxmSimd != BenchMarkKernels::SimdNo) ? GMX_SIMD_REAL_WIDTH : 0,
+#else
+                    0,
+#endif
+                    system.coordinates.size(), options.pairlistCutoff, options.numThreads,
+                    options.numIterations, options.computeVirialAndEnergy ? "yes" : "no",
+                    (options.coulombType != BenchMarkCoulomb::ReactionField)
+                            ? ((options.nbnxmSimd == BenchMarkKernels::SimdNo || options.useTabulatedEwaldCorr)
+                                       ? "table"
+                                       : "analytical")
+                            : "",
+                    options.coulombType == BenchMarkCoulomb::Pme ? "Ewald" : "RF",
+                    options.useHalfLJOptimization ? "half" : "all",
+                    combruleNames[options.ljCombinationRule].c_str(),
+                    kernelNames[options.nbnxmSimd].c_str());
+        }
     }
 
     // Run pre-iteration to avoid cache misses
@@ -326,18 +353,50 @@ static void setupAndRunInstance(const gmx::BenchmarkSystem& system,
     cycles = gmx_cycles_read() - cycles;
     if (!doWarmup)
     {
-        const double dCycles = static_cast<double>(cycles);
-        if (options.cyclesPerPair)
+        if (options.reportTime)
         {
-            fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", cycles * 1e-6,
-                    dCycles / options.numIterations * 1e-6, dCycles / (options.numIterations * numPairs),
-                    dCycles / (options.numIterations * numUsefulPairs));
+            const double uSec = static_cast<double>(cycles) * gmx_cycles_calibrate(1.0) * 1.e6;
+            if (options.cyclesPerPair)
+            {
+                fprintf(stdout, "%13.2f %13.3f %10.3f %10.3f\n", uSec, uSec / options.numIterations,
+                        uSec / (options.numIterations * numPairs),
+                        uSec / (options.numIterations * numUsefulPairs));
+                if (!options.outputFile.empty())
+                {
+                    fprintf(system.csv, "\"%.3f\",\"%.4f\",\"%.4f\",\"%.4f\"\n", uSec,
+                            uSec / options.numIterations, uSec / (options.numIterations * numPairs),
+                            uSec / (options.numIterations * numUsefulPairs));
+                }
+            }
+            else
+            {
+                fprintf(stdout, "%13.2f %13.3f %10.3f %10.3f\n", uSec, uSec / options.numIterations,
+                        options.numIterations * numPairs / uSec,
+                        options.numIterations * numUsefulPairs / uSec);
+                if (!options.outputFile.empty())
+                {
+                    fprintf(system.csv, "\"%.3f\",\"%.4f\",\"%.4f\",\"%.4f\"\n", uSec,
+                            uSec / options.numIterations, options.numIterations * numPairs / uSec,
+                            options.numIterations * numUsefulPairs / uSec);
+                }
+            }
         }
         else
         {
-            fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", dCycles * 1e-6,
-                    dCycles / options.numIterations * 1e-6, options.numIterations * numPairs / dCycles,
-                    options.numIterations * numUsefulPairs / dCycles);
+            const double dCycles = static_cast<double>(cycles);
+            if (options.cyclesPerPair)
+            {
+                fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", cycles * 1e-6,
+                        dCycles / options.numIterations * 1e-6,
+                        dCycles / (options.numIterations * numPairs),
+                        dCycles / (options.numIterations * numUsefulPairs));
+            }
+            else
+            {
+                fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", dCycles * 1e-6,
+                        dCycles / options.numIterations * 1e-6, options.numIterations * numPairs / dCycles,
+                        options.numIterations * numUsefulPairs / dCycles);
+            }
         }
     }
 }
@@ -348,7 +407,7 @@ void bench(const int sizeFactor, const KernelBenchOptions& options)
     gmx_omp_nthreads_set(emntPairsearch, options.numThreads);
     gmx_omp_nthreads_set(emntNonbonded, options.numThreads);
 
-    const gmx::BenchmarkSystem system(sizeFactor);
+    const gmx::BenchmarkSystem system(sizeFactor, options.outputFile);
 
     real minBoxSize = norm(system.box[XX]);
     for (int dim = YY; dim < DIM; dim++)
@@ -413,14 +472,46 @@ void bench(const int sizeFactor, const KernelBenchOptions& options)
         setupAndRunInstance(system, optionsList[0], true);
     }
 
-    fprintf(stdout, "Coulomb LJ   comb. SIMD    Mcycles  Mcycles/it.   %s\n",
-            options.cyclesPerPair ? "cycles/pair" : "pairs/cycle");
-    fprintf(stdout, "                                                total    useful\n");
+    if (options.reportTime)
+    {
+        fprintf(stdout, "Coulomb LJ   comb. SIMD       usec         usec/it.        %s\n",
+                options.cyclesPerPair ? "usec/pair" : "pairs/usec");
+        if (!options.outputFile.empty())
+        {
+            fprintf(system.csv,
+                    "\"width\",\"atoms\",\"cut-off radius\",\"threads\",\"iter\",\"compute "
+                    "energy\",\"Ewald excl. "
+                    "corr.\",\"Coulomb\",\"LJ\",\"comb\",\"SIMD\",\"usec\",\"usec/it\",\"total "
+                    "pairs/usec\",\"useful pairs/usec\"\n");
+        }
+        fprintf(stdout,
+                "                                                        total      useful\n");
+    }
+    else
+    {
+        fprintf(stdout, "Coulomb LJ   comb. SIMD    Mcycles  Mcycles/it.   %s\n",
+                options.cyclesPerPair ? "cycles/pair" : "pairs/cycle");
+        if (!options.outputFile.empty())
+        {
+            fprintf(system.csv,
+                    "\"width\",\"atoms\",\"cut-off radius\",\"threads\",\"iter\",\"compute "
+                    "energy\",\"Ewald excl. "
+                    "corr.\",\"Coulomb\",\"LJ\",\"comb\",\"SIMD\",\"Mcycles\",\"Mcycles/"
+                    "it\",\"total "
+                    "total cycles/pair\",\"total cycles per useful pair\"\n");
+        }
+        fprintf(stdout, "                                                total    useful\n");
+    }
 
     for (const auto& optionsInstance : optionsList)
     {
         setupAndRunInstance(system, optionsInstance, false);
     }
+
+    if (!options.outputFile.empty())
+    {
+        fclose(system.csv);
+    }
 }
 
 } // namespace Nbnxm
index 2e33352376579ff74a89a900b12477b2b7d907c2..d7b0f296fb089388254fdd54a50792e878d0ed11 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -46,6 +46,8 @@
 #ifndef GMX_NBNXN_BENCH_SETUP_H
 #define GMX_NBNXN_BENCH_SETUP_H
 
+#include <string>
+
 #include "gromacs/utility/real.h"
 
 namespace Nbnxm
@@ -113,6 +115,10 @@ struct KernelBenchOptions
     int numWarmupIterations = 0;
     //! Print cycles/pair instead of pairs/cycle
     bool cyclesPerPair = false;
+    //! Report in micro seconds instead of cycles
+    bool reportTime = false;
+    //! Also report into a csv file
+    std::string outputFile;
 };
 
 /*! \brief
index 553f4cafd26df838b92570d1e4187bfe4bc5bc98..de4d738969b6e6daeebc94a9805a857c0682fb3f 100644 (file)
@@ -150,7 +150,7 @@ static void generateCoordinates(int multiplicationFactor, std::vector<gmx::RVec>
     }
 }
 
-BenchmarkSystem::BenchmarkSystem(const int multiplicationFactor)
+BenchmarkSystem::BenchmarkSystem(const int multiplicationFactor, const std::string& outputFile)
 {
     numAtomTypes = 2;
     nonbondedParameters.resize(numAtomTypes * numAtomTypes * 2, 0);
@@ -199,6 +199,10 @@ BenchmarkSystem::BenchmarkSystem(const int multiplicationFactor)
     forceRec.nbfp  = nonbondedParameters;
     snew(forceRec.shift_vec, SHIFTS);
     calc_shifts(box, forceRec.shift_vec);
+    if (!outputFile.empty())
+    {
+        csv = fopen(outputFile.c_str(), "w+");
+    }
 }
 
 } // namespace gmx
index adcc85d4ffacc443b36b48dc8040e09ffa2fb1aa..acf02326699b1ba27e85e788e62dd2b58f7b6c10 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -44,6 +44,7 @@
 #ifndef GMX_NBNXN_BENCH_SYSTEM_H
 #define GMX_NBNXN_BENCH_SYSTEM_H
 
+#include <string>
 #include <vector>
 
 #include "gromacs/math/vectypes.h"
@@ -64,8 +65,9 @@ struct BenchmarkSystem
      * with 3000 atoms total.
      *
      * \param[in] multiplicationFactor  Should be a power of 2, is checked
+     * \param[in] outputFile            The name of the csv file to write benchmark results
      */
-    BenchmarkSystem(int multiplicationFactor);
+    BenchmarkSystem(int multiplicationFactor, const std::string& outputFile);
 
     //! Number of different atom types in test system.
     int numAtomTypes;
@@ -87,6 +89,8 @@ struct BenchmarkSystem
     matrix box;
     //! Forcerec with only the entries used in the benchmark set
     t_forcerec forceRec;
+    //! csv output file
+    FILE* csv;
 };
 
 } // namespace gmx
index 6c1a0d43669e1265d9d51efcfc250567fc54df9b..30637ade0c622e6a7a2b40d3e90d5568cff457c9 100644 (file)
  */
 double gmx_cycles_calibrate(double sampletime)
 {
-#ifdef _MSC_VER
+    /* On ARM and recent-generation x86-64, we can use the more accurate cycle counters
+     * that allow better timing for things that depend on it (e.g. load balancing, profiling).
+     */
+#if ((defined __aarch64__) \
+     && (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__) || defined(__PGIC__)))
+    /* 64-bit ARM cycle counters with GCC inline assembly */
+    unsigned long cycles;
+    __asm__ __volatile__("mrs %0, cntfrq_el0" : "=r"(cycles));
+    /* Only first 32 bits are significant */
+    cycles &= 0xFFFFFFFF;
+    return 1. / cycles;
+    GMX_UNUSED_VALUE(sampletime);
+#else
+#    if ((defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__) || defined(__PGIC__)) \
+         && defined(__x86_64__) && !defined(_CRAYC))
+    long gmx_unused tmp;
+    int             cpuid1;
+    int gmx_unused cpuid2;
+    const int      l0  = 0x0;
+    const int      l16 = 0x16;
+    gmx_cycles_t   cycles;
+
+    /* cpuid clobbers ebx but it must be restored for -fPIC so save
+     * then restore ebx */
+    __asm__ volatile(
+            "xchg %%rbx, %2\n"
+            "cpuid\n"
+            "xchg %%rbx, %2\n"
+            : "=a"(cpuid1), "=d"(cpuid2), "=r"(tmp)
+            : "a"(l0)
+            : "ecx", "ebx");
+    if (cpuid1 >= 0x16)
+    {
+        /* This CPU is recent enough so the timer frequency can be directly queried */
+        __asm__ volatile(
+                "xchg %%rbx, %2\n"
+                "cpuid\n"
+                "xchg %%rbx, %2\n"
+                : "=a"(cpuid1), "=d"(cpuid2), "=r"(tmp)
+                : "a"(l16)
+                : "ecx", "ebx");
+        cycles = static_cast<gmx_cycles_t>(cpuid1) * static_cast<gmx_cycles_t>(1000000);
+        return 1. / cycles;
+    }
+#    endif
+#    ifdef _MSC_VER
 
     /* Windows does not have gettimeofday, but it provides a special
      * routine that returns the cycle counter frequency.
@@ -77,7 +122,7 @@ double gmx_cycles_calibrate(double sampletime)
     return 1.0 / static_cast<double>(i.QuadPart);
     /* end of MS Windows implementation */
 
-#elif HAVE_GETTIMEOFDAY
+#    elif HAVE_GETTIMEOFDAY
 
     /*  generic implementation with gettimeofday() */
     struct timeval t1, t2;
@@ -90,7 +135,7 @@ double gmx_cycles_calibrate(double sampletime)
         return -1;
     }
 
-#    if (defined(__alpha__) || defined(__alpha))
+#        if (defined(__alpha__) || defined(__alpha))
     /* Alpha cannot count to more than 4e9, but I don't expect
      * that the architecture will go over 2GHz before it dies, so
      * up to 2.0 seconds of sampling should be safe.
@@ -99,7 +144,7 @@ double gmx_cycles_calibrate(double sampletime)
     {
         sampletime = 2.0;
     }
-#    endif
+#        endif
 
     /* Start a timing loop. We want this to be largely independent
      * of machine speed, so we need to start with a very small number
@@ -138,9 +183,10 @@ double gmx_cycles_calibrate(double sampletime)
     /* Return seconds per cycle */
     return timediff / cyclediff;
 
-#else
+#    else
     /* No timing function available */
     return -1;
     GMX_UNUSED_VALUE(sampletime);
+#    endif
 #endif
 }
index 9d4cd52eeb04860b614f4c1b2dab611706c90ca3..bd693790b3499337cb7cb0aa44d89a3dd638daef 100644 (file)
@@ -208,6 +208,14 @@ void NonbondedBenchmark::initOptions(IOptionsContainer* options, ICommandLineOpt
     options->addOption(BooleanOption("cycles")
                                .store(&benchmarkOptions_.cyclesPerPair)
                                .description("Report cycles/pair instead of pairs/cycle"));
+    options->addOption(
+            BooleanOption("time").store(&benchmarkOptions_.reportTime).description("Report micro-seconds instead of cycles"));
+    options->addOption(FileNameOption("o")
+                               .filetype(eftCsv)
+                               .outputFile()
+                               .store(&benchmarkOptions_.outputFile)
+                               .defaultBasename("nonbonded-benchmark")
+                               .description("Also output results in csv format"));
 }
 
 void NonbondedBenchmark::optionsFinished()