timing: try to directly retrieve gmx_cycles_calibrate() from the hardware

author Gilles Gouaillardet <gilles@rist.or.jp>

Tue, 3 Nov 2020 11:23:27 +0000 (11:23 +0000)

committer Paul Bauer <paul.bauer.q@gmail.com>

Tue, 3 Nov 2020 11:23:27 +0000 (11:23 +0000)
author Gilles Gouaillardet <gilles@rist.or.jp>
Tue, 3 Nov 2020 11:23:27 +0000 (11:23 +0000)
committer Paul Bauer <paul.bauer.q@gmail.com>
Tue, 3 Nov 2020 11:23:27 +0000 (11:23 +0000)
diff --git a/src/gromacs/nbnxm/benchmark/bench_setup.cpp b/src/gromacs/nbnxm/benchmark/bench_setup.cpp

index ee4427e9b255438108ffeec00ac835afecd1d395..9fffb257ea21f4f4db102c79b1b4d7e1d225fba8 100644 (file)
--- a/src/gromacs/nbnxm/benchmark/bench_setup.cpp
+++ b/src/gromacs/nbnxm/benchmark/bench_setup.cpp
@@ -100,6 +100,11 @@ static std::optional<std::string> checkKernelSetup(const KernelBenchOptions& opt
          return "the requested SIMD kernel was not set up at configuration time";
      }
  
+    if (options.reportTime && (0 > gmx_cycles_calibrate(1.0)))
+    {
+        return "the -time option is not supported on this system";
+    }
+
      return {};
  }
  
@@ -304,6 +309,28 @@ static void setupAndRunInstance(const gmx::BenchmarkSystem& system,
                  options.coulombType == BenchMarkCoulomb::Pme ? "Ewald" : "RF",
                  options.useHalfLJOptimization ? "half" : "all",
                  combruleNames[options.ljCombinationRule].c_str(), kernelNames[options.nbnxmSimd].c_str());
+        if (!options.outputFile.empty())
+        {
+            fprintf(system.csv,
+                    "\"%d\",\"%zu\",\"%g\",\"%d\",\"%d\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%"
+                    "s\",",
+#if GMX_SIMD
+                    (options.nbnxmSimd != BenchMarkKernels::SimdNo) ? GMX_SIMD_REAL_WIDTH : 0,
+#else
+                    0,
+#endif
+                    system.coordinates.size(), options.pairlistCutoff, options.numThreads,
+                    options.numIterations, options.computeVirialAndEnergy ? "yes" : "no",
+                    (options.coulombType != BenchMarkCoulomb::ReactionField)
+                            ? ((options.nbnxmSimd == BenchMarkKernels::SimdNo || options.useTabulatedEwaldCorr)
+                                       ? "table"
+                                       : "analytical")
+                            : "",
+                    options.coulombType == BenchMarkCoulomb::Pme ? "Ewald" : "RF",
+                    options.useHalfLJOptimization ? "half" : "all",
+                    combruleNames[options.ljCombinationRule].c_str(),
+                    kernelNames[options.nbnxmSimd].c_str());
+        }
      }
  
      // Run pre-iteration to avoid cache misses
@@ -326,18 +353,50 @@ static void setupAndRunInstance(const gmx::BenchmarkSystem& system,
      cycles = gmx_cycles_read() - cycles;
      if (!doWarmup)
      {
-        const double dCycles = static_cast<double>(cycles);
-        if (options.cyclesPerPair)
+        if (options.reportTime)
          {
-            fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", cycles * 1e-6,
-                    dCycles / options.numIterations * 1e-6, dCycles / (options.numIterations * numPairs),
-                    dCycles / (options.numIterations * numUsefulPairs));
+            const double uSec = static_cast<double>(cycles) * gmx_cycles_calibrate(1.0) * 1.e6;
+            if (options.cyclesPerPair)
+            {
+                fprintf(stdout, "%13.2f %13.3f %10.3f %10.3f\n", uSec, uSec / options.numIterations,
+                        uSec / (options.numIterations * numPairs),
+                        uSec / (options.numIterations * numUsefulPairs));
+                if (!options.outputFile.empty())
+                {
+                    fprintf(system.csv, "\"%.3f\",\"%.4f\",\"%.4f\",\"%.4f\"\n", uSec,
+                            uSec / options.numIterations, uSec / (options.numIterations * numPairs),
+                            uSec / (options.numIterations * numUsefulPairs));
+                }
+            }
+            else
+            {
+                fprintf(stdout, "%13.2f %13.3f %10.3f %10.3f\n", uSec, uSec / options.numIterations,
+                        options.numIterations * numPairs / uSec,
+                        options.numIterations * numUsefulPairs / uSec);
+                if (!options.outputFile.empty())
+                {
+                    fprintf(system.csv, "\"%.3f\",\"%.4f\",\"%.4f\",\"%.4f\"\n", uSec,
+                            uSec / options.numIterations, options.numIterations * numPairs / uSec,
+                            options.numIterations * numUsefulPairs / uSec);
+                }
+            }
          }
          else
          {
-            fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", dCycles * 1e-6,
-                    dCycles / options.numIterations * 1e-6, options.numIterations * numPairs / dCycles,
-                    options.numIterations * numUsefulPairs / dCycles);
+            const double dCycles = static_cast<double>(cycles);
+            if (options.cyclesPerPair)
+            {
+                fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", cycles * 1e-6,
+                        dCycles / options.numIterations * 1e-6,
+                        dCycles / (options.numIterations * numPairs),
+                        dCycles / (options.numIterations * numUsefulPairs));
+            }
+            else
+            {
+                fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", dCycles * 1e-6,
+                        dCycles / options.numIterations * 1e-6, options.numIterations * numPairs / dCycles,
+                        options.numIterations * numUsefulPairs / dCycles);
+            }
          }
      }
  }
@@ -348,7 +407,7 @@ void bench(const int sizeFactor, const KernelBenchOptions& options)
      gmx_omp_nthreads_set(emntPairsearch, options.numThreads);
      gmx_omp_nthreads_set(emntNonbonded, options.numThreads);
  
-    const gmx::BenchmarkSystem system(sizeFactor);
+    const gmx::BenchmarkSystem system(sizeFactor, options.outputFile);
  
      real minBoxSize = norm(system.box[XX]);
      for (int dim = YY; dim < DIM; dim++)
@@ -413,14 +472,46 @@ void bench(const int sizeFactor, const KernelBenchOptions& options)
          setupAndRunInstance(system, optionsList[0], true);
      }
  
-    fprintf(stdout, "Coulomb LJ   comb. SIMD    Mcycles  Mcycles/it.   %s\n",
-            options.cyclesPerPair ? "cycles/pair" : "pairs/cycle");
-    fprintf(stdout, "                                                total    useful\n");
+    if (options.reportTime)
+    {
+        fprintf(stdout, "Coulomb LJ   comb. SIMD       usec         usec/it.        %s\n",
+                options.cyclesPerPair ? "usec/pair" : "pairs/usec");
+        if (!options.outputFile.empty())
+        {
+            fprintf(system.csv,
+                    "\"width\",\"atoms\",\"cut-off radius\",\"threads\",\"iter\",\"compute "
+                    "energy\",\"Ewald excl. "
+                    "corr.\",\"Coulomb\",\"LJ\",\"comb\",\"SIMD\",\"usec\",\"usec/it\",\"total "
+                    "pairs/usec\",\"useful pairs/usec\"\n");
+        }
+        fprintf(stdout,
+                "                                                        total      useful\n");
+    }
+    else
+    {
+        fprintf(stdout, "Coulomb LJ   comb. SIMD    Mcycles  Mcycles/it.   %s\n",
+                options.cyclesPerPair ? "cycles/pair" : "pairs/cycle");
+        if (!options.outputFile.empty())
+        {
+            fprintf(system.csv,
+                    "\"width\",\"atoms\",\"cut-off radius\",\"threads\",\"iter\",\"compute "
+                    "energy\",\"Ewald excl. "
+                    "corr.\",\"Coulomb\",\"LJ\",\"comb\",\"SIMD\",\"Mcycles\",\"Mcycles/"
+                    "it\",\"total "
+                    "total cycles/pair\",\"total cycles per useful pair\"\n");
+        }
+        fprintf(stdout, "                                                total    useful\n");
+    }
  
      for (const auto& optionsInstance : optionsList)
      {
          setupAndRunInstance(system, optionsInstance, false);
      }
+
+    if (!options.outputFile.empty())
+    {
+        fclose(system.csv);
+    }
  }
  
  } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/benchmark/bench_setup.h b/src/gromacs/nbnxm/benchmark/bench_setup.h

index 2e33352376579ff74a89a900b12477b2b7d907c2..d7b0f296fb089388254fdd54a50792e878d0ed11 100644 (file)
--- a/src/gromacs/nbnxm/benchmark/bench_setup.h
+++ b/src/gromacs/nbnxm/benchmark/bench_setup.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -46,6 +46,8 @@
  #ifndef GMX_NBNXN_BENCH_SETUP_H
  #define GMX_NBNXN_BENCH_SETUP_H
  
+#include <string>
+
  #include "gromacs/utility/real.h"
  
  namespace Nbnxm
@@ -113,6 +115,10 @@ struct KernelBenchOptions
      int numWarmupIterations = 0;
      //! Print cycles/pair instead of pairs/cycle
      bool cyclesPerPair = false;
+    //! Report in microseconds instead of cycles
+    bool reportTime = false;
+    //! Name of the csv file to also write results to; empty disables csv output
+    std::string outputFile;
  };
  
  /*! \brief
diff --git a/src/gromacs/nbnxm/benchmark/bench_system.cpp b/src/gromacs/nbnxm/benchmark/bench_system.cpp

index 553f4cafd26df838b92570d1e4187bfe4bc5bc98..de4d738969b6e6daeebc94a9805a857c0682fb3f 100644 (file)
--- a/src/gromacs/nbnxm/benchmark/bench_system.cpp
+++ b/src/gromacs/nbnxm/benchmark/bench_system.cpp
@@ -150,7 +150,7 @@ static void generateCoordinates(int multiplicationFactor, std::vector<gmx::RVec>
      }
  }
  
-BenchmarkSystem::BenchmarkSystem(const int multiplicationFactor)
+BenchmarkSystem::BenchmarkSystem(const int multiplicationFactor, const std::string& outputFile)
  {
      numAtomTypes = 2;
      nonbondedParameters.resize(numAtomTypes * numAtomTypes * 2, 0);
@@ -199,6 +199,10 @@ BenchmarkSystem::BenchmarkSystem(const int multiplicationFactor)
      forceRec.nbfp  = nonbondedParameters;
      snew(forceRec.shift_vec, SHIFTS);
      calc_shifts(box, forceRec.shift_vec);
+    if (!outputFile.empty())
+    {
+        csv = fopen(outputFile.c_str(), "w+");
+    }
  }
  
  } // namespace gmx
diff --git a/src/gromacs/nbnxm/benchmark/bench_system.h b/src/gromacs/nbnxm/benchmark/bench_system.h

index adcc85d4ffacc443b36b48dc8040e09ffa2fb1aa..acf02326699b1ba27e85e788e62dd2b58f7b6c10 100644 (file)
--- a/src/gromacs/nbnxm/benchmark/bench_system.h
+++ b/src/gromacs/nbnxm/benchmark/bench_system.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -44,6 +44,7 @@
  #ifndef GMX_NBNXN_BENCH_SYSTEM_H
  #define GMX_NBNXN_BENCH_SYSTEM_H
  
+#include <string>
  #include <vector>
  
  #include "gromacs/math/vectypes.h"
@@ -64,8 +65,9 @@ struct BenchmarkSystem
       * with 3000 atoms total.
       *
       * \param[in] multiplicationFactor  Should be a power of 2, is checked
+     * \param[in] outputFile            The name of the csv file to write benchmark results to
       */
-    BenchmarkSystem(int multiplicationFactor);
+    BenchmarkSystem(int multiplicationFactor, const std::string& outputFile);
  
      //! Number of different atom types in test system.
      int numAtomTypes;
@@ -87,6 +89,8 @@ struct BenchmarkSystem
      matrix box;
      //! Forcerec with only the entries used in the benchmark set
      t_forcerec forceRec;
+    //! csv output file, only opened when a non-empty file name is passed to the constructor
+    FILE* csv;
  };
  
  } // namespace gmx
diff --git a/src/gromacs/timing/cyclecounter.cpp b/src/gromacs/timing/cyclecounter.cpp

index 6c1a0d43669e1265d9d51efcfc250567fc54df9b..30637ade0c622e6a7a2b40d3e90d5568cff457c9 100644 (file)
--- a/src/gromacs/timing/cyclecounter.cpp
+++ b/src/gromacs/timing/cyclecounter.cpp
@@ -65,7 +65,52 @@
   */
  double gmx_cycles_calibrate(double sampletime)
  {
-#ifdef _MSC_VER
+    /* On ARM and recent-generation x86-64, the counter frequency can be read directly from
+     * the hardware; this improves timing-dependent features (e.g. load balancing, profiling).
+     */
+#if ((defined __aarch64__) \
+     && (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__) || defined(__PGIC__)))
+    /* 64-bit ARM cycle counters with GCC inline assembly */
+    unsigned long cycles;
+    __asm__ __volatile__("mrs %0, cntfrq_el0" : "=r"(cycles));
+    /* Only the lower 32 bits are significant */
+    cycles &= 0xFFFFFFFF;
+    return 1. / cycles;
+    GMX_UNUSED_VALUE(sampletime);
+#else
+#    if ((defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__) || defined(__PGIC__)) \
+         && defined(__x86_64__) && !defined(_CRAYC))
+    long gmx_unused tmp;
+    int             cpuid1;
+    int gmx_unused cpuid2;
+    const int      l0  = 0x0;
+    const int      l16 = 0x16;
+    gmx_cycles_t   cycles;
+
+    /* cpuid clobbers ebx but it must be restored for -fPIC so save
+     * then restore ebx */
+    __asm__ volatile(
+            "xchg %%rbx, %2\n"
+            "cpuid\n"
+            "xchg %%rbx, %2\n"
+            : "=a"(cpuid1), "=d"(cpuid2), "=r"(tmp)
+            : "a"(l0)
+            : "ecx", "ebx");
+    if (cpuid1 >= 0x16)
+    {
+        /* This CPU is recent enough so the timer frequency can be directly queried */
+        __asm__ volatile(
+                "xchg %%rbx, %2\n"
+                "cpuid\n"
+                "xchg %%rbx, %2\n"
+                : "=a"(cpuid1), "=d"(cpuid2), "=r"(tmp)
+                : "a"(l16)
+                : "ecx", "ebx");
+        cycles = static_cast<gmx_cycles_t>(cpuid1) * static_cast<gmx_cycles_t>(1000000);
+        return 1. / cycles;
+    }
+#    endif
+#    ifdef _MSC_VER
  
      /* Windows does not have gettimeofday, but it provides a special
       * routine that returns the cycle counter frequency.
@@ -77,7 +122,7 @@ double gmx_cycles_calibrate(double sampletime)
      return 1.0 / static_cast<double>(i.QuadPart);
      /* end of MS Windows implementation */
  
-#elif HAVE_GETTIMEOFDAY
+#    elif HAVE_GETTIMEOFDAY
  
      /*  generic implementation with gettimeofday() */
      struct timeval t1, t2;
@@ -90,7 +135,7 @@ double gmx_cycles_calibrate(double sampletime)
          return -1;
      }
  
-#    if (defined(__alpha__) || defined(__alpha))
+#        if (defined(__alpha__) || defined(__alpha))
      /* Alpha cannot count to more than 4e9, but I don't expect
       * that the architecture will go over 2GHz before it dies, so
       * up to 2.0 seconds of sampling should be safe.
@@ -99,7 +144,7 @@ double gmx_cycles_calibrate(double sampletime)
      {
          sampletime = 2.0;
      }
-#    endif
+#        endif
  
      /* Start a timing loop. We want this to be largely independent
       * of machine speed, so we need to start with a very small number
@@ -138,9 +183,10 @@ double gmx_cycles_calibrate(double sampletime)
      /* Return seconds per cycle */
      return timediff / cyclediff;
  
-#else
+#    else
      /* No timing function available */
      return -1;
      GMX_UNUSED_VALUE(sampletime);
+#    endif
  #endif
  }
diff --git a/src/programs/mdrun/nonbonded_bench.cpp b/src/programs/mdrun/nonbonded_bench.cpp

index 9d4cd52eeb04860b614f4c1b2dab611706c90ca3..bd693790b3499337cb7cb0aa44d89a3dd638daef 100644 (file)
--- a/src/programs/mdrun/nonbonded_bench.cpp
+++ b/src/programs/mdrun/nonbonded_bench.cpp
@@ -208,6 +208,14 @@ void NonbondedBenchmark::initOptions(IOptionsContainer* options, ICommandLineOpt
      options->addOption(BooleanOption("cycles")
                                 .store(&benchmarkOptions_.cyclesPerPair)
                                 .description("Report cycles/pair instead of pairs/cycle"));
+    options->addOption(
+            BooleanOption("time").store(&benchmarkOptions_.reportTime).description("Report micro-seconds instead of cycles"));
+    options->addOption(FileNameOption("o")
+                               .filetype(eftCsv)
+                               .outputFile()
+                               .store(&benchmarkOptions_.outputFile)
+                               .defaultBasename("nonbonded-benchmark")
+                               .description("Also output results in csv format"));
  }
  
  void NonbondedBenchmark::optionsFinished()
author	Gilles Gouaillardet <gilles@rist.or.jp>
	Tue, 3 Nov 2020 11:23:27 +0000 (11:23 +0000)
committer	Paul Bauer <paul.bauer.q@gmail.com>
	Tue, 3 Nov 2020 11:23:27 +0000 (11:23 +0000)
src/gromacs/nbnxm/benchmark/bench_setup.cpp		patch \| blob \| history
src/gromacs/nbnxm/benchmark/bench_setup.h		patch \| blob \| history
src/gromacs/nbnxm/benchmark/bench_system.cpp		patch \| blob \| history
src/gromacs/nbnxm/benchmark/bench_system.h		patch \| blob \| history
src/gromacs/timing/cyclecounter.cpp		patch \| blob \| history
src/programs/mdrun/nonbonded_bench.cpp		patch \| blob \| history