return "the requested SIMD kernel was not set up at configuration time";
}
+ if (options.reportTime && (0 > gmx_cycles_calibrate(1.0)))
+ {
+ return "the -time option is not supported on this system";
+ }
+
return {};
}
options.coulombType == BenchMarkCoulomb::Pme ? "Ewald" : "RF",
options.useHalfLJOptimization ? "half" : "all",
combruleNames[options.ljCombinationRule].c_str(), kernelNames[options.nbnxmSimd].c_str());
+ if (!options.outputFile.empty())
+ {
+ fprintf(system.csv,
+ "\"%d\",\"%zu\",\"%g\",\"%d\",\"%d\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%"
+ "s\",",
+#if GMX_SIMD
+ (options.nbnxmSimd != BenchMarkKernels::SimdNo) ? GMX_SIMD_REAL_WIDTH : 0,
+#else
+ 0,
+#endif
+ system.coordinates.size(), options.pairlistCutoff, options.numThreads,
+ options.numIterations, options.computeVirialAndEnergy ? "yes" : "no",
+ (options.coulombType != BenchMarkCoulomb::ReactionField)
+ ? ((options.nbnxmSimd == BenchMarkKernels::SimdNo || options.useTabulatedEwaldCorr)
+ ? "table"
+ : "analytical")
+ : "",
+ options.coulombType == BenchMarkCoulomb::Pme ? "Ewald" : "RF",
+ options.useHalfLJOptimization ? "half" : "all",
+ combruleNames[options.ljCombinationRule].c_str(),
+ kernelNames[options.nbnxmSimd].c_str());
+ }
}
// Run pre-iteration to avoid cache misses
cycles = gmx_cycles_read() - cycles;
if (!doWarmup)
{
- const double dCycles = static_cast<double>(cycles);
- if (options.cyclesPerPair)
+ if (options.reportTime)
{
- fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", cycles * 1e-6,
- dCycles / options.numIterations * 1e-6, dCycles / (options.numIterations * numPairs),
- dCycles / (options.numIterations * numUsefulPairs));
+ const double uSec = static_cast<double>(cycles) * gmx_cycles_calibrate(1.0) * 1.e6;
+ if (options.cyclesPerPair)
+ {
+ fprintf(stdout, "%13.2f %13.3f %10.3f %10.3f\n", uSec, uSec / options.numIterations,
+ uSec / (options.numIterations * numPairs),
+ uSec / (options.numIterations * numUsefulPairs));
+ if (!options.outputFile.empty())
+ {
+ fprintf(system.csv, "\"%.3f\",\"%.4f\",\"%.4f\",\"%.4f\"\n", uSec,
+ uSec / options.numIterations, uSec / (options.numIterations * numPairs),
+ uSec / (options.numIterations * numUsefulPairs));
+ }
+ }
+ else
+ {
+ fprintf(stdout, "%13.2f %13.3f %10.3f %10.3f\n", uSec, uSec / options.numIterations,
+ options.numIterations * numPairs / uSec,
+ options.numIterations * numUsefulPairs / uSec);
+ if (!options.outputFile.empty())
+ {
+ fprintf(system.csv, "\"%.3f\",\"%.4f\",\"%.4f\",\"%.4f\"\n", uSec,
+ uSec / options.numIterations, options.numIterations * numPairs / uSec,
+ options.numIterations * numUsefulPairs / uSec);
+ }
+ }
}
else
{
- fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", dCycles * 1e-6,
- dCycles / options.numIterations * 1e-6, options.numIterations * numPairs / dCycles,
- options.numIterations * numUsefulPairs / dCycles);
+ const double dCycles = static_cast<double>(cycles);
+ if (options.cyclesPerPair)
+ {
+ fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", cycles * 1e-6,
+ dCycles / options.numIterations * 1e-6,
+ dCycles / (options.numIterations * numPairs),
+ dCycles / (options.numIterations * numUsefulPairs));
+ }
+ else
+ {
+ fprintf(stdout, "%10.3f %10.4f %8.4f %8.4f\n", dCycles * 1e-6,
+ dCycles / options.numIterations * 1e-6, options.numIterations * numPairs / dCycles,
+ options.numIterations * numUsefulPairs / dCycles);
+ }
}
}
}
gmx_omp_nthreads_set(emntPairsearch, options.numThreads);
gmx_omp_nthreads_set(emntNonbonded, options.numThreads);
- const gmx::BenchmarkSystem system(sizeFactor);
+ const gmx::BenchmarkSystem system(sizeFactor, options.outputFile);
real minBoxSize = norm(system.box[XX]);
for (int dim = YY; dim < DIM; dim++)
setupAndRunInstance(system, optionsList[0], true);
}
- fprintf(stdout, "Coulomb LJ comb. SIMD Mcycles Mcycles/it. %s\n",
- options.cyclesPerPair ? "cycles/pair" : "pairs/cycle");
- fprintf(stdout, " total useful\n");
+ if (options.reportTime)
+ {
+ fprintf(stdout, "Coulomb LJ comb. SIMD usec usec/it. %s\n",
+ options.cyclesPerPair ? "usec/pair" : "pairs/usec");
+ if (!options.outputFile.empty())
+ {
+ fprintf(system.csv,
+ "\"width\",\"atoms\",\"cut-off radius\",\"threads\",\"iter\",\"compute "
+ "energy\",\"Ewald excl. "
+ "corr.\",\"Coulomb\",\"LJ\",\"comb\",\"SIMD\",\"usec\",\"usec/it\",\"total "
+ "pairs/usec\",\"useful pairs/usec\"\n");
+ }
+ fprintf(stdout,
+ " total useful\n");
+ }
+ else
+ {
+ fprintf(stdout, "Coulomb LJ comb. SIMD Mcycles Mcycles/it. %s\n",
+ options.cyclesPerPair ? "cycles/pair" : "pairs/cycle");
+ if (!options.outputFile.empty())
+ {
+ fprintf(system.csv,
+ "\"width\",\"atoms\",\"cut-off radius\",\"threads\",\"iter\",\"compute "
+ "energy\",\"Ewald excl. "
+                    "corr.\",\"Coulomb\",\"LJ\",\"comb\",\"SIMD\",\"Mcycles\",\"Mcycles/"
+                    "it\",\"total cycles/pair\",\"total cycles per useful pair\"\n");
+ }
+ fprintf(stdout, " total useful\n");
+ }
for (const auto& optionsInstance : optionsList)
{
setupAndRunInstance(system, optionsInstance, false);
}
+
+ if (!options.outputFile.empty())
+ {
+ fclose(system.csv);
+ }
}
} // namespace Nbnxm
*/
double gmx_cycles_calibrate(double sampletime)
{
-#ifdef _MSC_VER
+    /* On 64-bit ARM and recent-generation x86-64, the timer/counter frequency can be
+     * queried directly from the hardware, which is more accurate than the generic timed
+     * calibration loop below (useful for things that depend on good timing, e.g. load
+     * balancing and profiling).
+     */
+#if ((defined __aarch64__) \
+ && (defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__) || defined(__PGIC__)))
+    /* 64-bit ARM: read the counter frequency from the cntfrq_el0 system register */
+ unsigned long cycles;
+ __asm__ __volatile__("mrs %0, cntfrq_el0" : "=r"(cycles));
+ /* Only first 32 bits are significant */
+ cycles &= 0xFFFFFFFF;
+ return 1. / cycles;
+ GMX_UNUSED_VALUE(sampletime);
+#else
+# if ((defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__PATHSCALE__) || defined(__PGIC__)) \
+ && defined(__x86_64__) && !defined(_CRAYC))
+ long gmx_unused tmp;
+ int cpuid1;
+ int gmx_unused cpuid2;
+ const int l0 = 0x0;
+ const int l16 = 0x16;
+ gmx_cycles_t cycles;
+
+ /* cpuid clobbers ebx but it must be restored for -fPIC so save
+ * then restore ebx */
+ __asm__ volatile(
+ "xchg %%rbx, %2\n"
+ "cpuid\n"
+ "xchg %%rbx, %2\n"
+ : "=a"(cpuid1), "=d"(cpuid2), "=r"(tmp)
+ : "a"(l0)
+ : "ecx", "ebx");
+ if (cpuid1 >= 0x16)
+ {
+ /* This CPU is recent enough so the timer frequency can be directly queried */
+ __asm__ volatile(
+ "xchg %%rbx, %2\n"
+ "cpuid\n"
+ "xchg %%rbx, %2\n"
+ : "=a"(cpuid1), "=d"(cpuid2), "=r"(tmp)
+ : "a"(l16)
+ : "ecx", "ebx");
+ cycles = static_cast<gmx_cycles_t>(cpuid1) * static_cast<gmx_cycles_t>(1000000);
+ return 1. / cycles;
+ }
+# endif
+# ifdef _MSC_VER
/* Windows does not have gettimeofday, but it provides a special
* routine that returns the cycle counter frequency.
return 1.0 / static_cast<double>(i.QuadPart);
/* end of MS Windows implementation */
-#elif HAVE_GETTIMEOFDAY
+# elif HAVE_GETTIMEOFDAY
/* generic implementation with gettimeofday() */
struct timeval t1, t2;
return -1;
}
-# if (defined(__alpha__) || defined(__alpha))
+# if (defined(__alpha__) || defined(__alpha))
/* Alpha cannot count to more than 4e9, but I don't expect
* that the architecture will go over 2GHz before it dies, so
* up to 2.0 seconds of sampling should be safe.
{
sampletime = 2.0;
}
-# endif
+# endif
/* Start a timing loop. We want this to be largely independent
* of machine speed, so we need to start with a very small number
/* Return seconds per cycle */
return timediff / cyclediff;
-#else
+# else
/* No timing function available */
return -1;
GMX_UNUSED_VALUE(sampletime);
+# endif
#endif
}