src/gromacs/hardware/identifyavx512fmaunits.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  * \brief Implements a routine to check the number of AVX512 fma units
  38  *
  39  * Just as the CpuInfo code, we need to be able to compile this file in stand-alone mode
  40  * to set the SIMD acceleration and similar things during CMake configuration.
  41  */
  42
  43 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
  44 #    include "gmxpre.h"
  45 #endif
  46
  47 #include "identifyavx512fmaunits.h"
  48
  49 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
  50 #    include "config.h"
  51 #endif
  52
  53 #include <cstdint>
  54 #include <cstdio>
  55
  56 #include <algorithm>
  57 #include <mutex>
  58
  59 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
  60 #    include "gromacs/hardware/cpuinfo.h"
  61 #endif
  62
  63 namespace gmx
  64 {
  65
  66 namespace
  67 {
  68
  69 #if GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
  70 /*! \brief Loop over mixed FMA and shuffle AVX512 instructions
  71  *
  72  * This function executes a meaningless loop that includes both
  73  * FMA and shuffle instructions from the AVX512 instruction set.
  74  * We need a bit of complex logic to make sure it cannot be
  75  * optimized away by the compiler.
      *
      * The time-stamp counter is read with rdtscp right before and right
      * after the loop, and the difference between the two readings is
      * what gets returned. The register setup in front of the first
      * rdtscp is therefore not part of the measurement.
      *
      * The loop counter is decremented and tested (signed, with jg) at the
      * end of the loop body, so the body is executed at least once, even
      * when \p loopCount is zero.
  76  *
  77  * \param loopCount  Number of iterations. Each iteration will
  78  *                   execute 12 FMA and 12 shuffle instructions.
  79  * \return Number of cycles used for the loop.
  80  */
  81 uint64_t timeFmaAndShuffleLoop(uint64_t loopCount)
  82 {
  83     uint64_t cycles;
  84     // Unfortunately we need to resort to inline ASM since we are
  85     // making a choice based on timing, and without efficient optimization
  86     // (e.g. when doing debugging) the usual intrinsics are often implemented
  87     // as independent load/store operations, which completely screws up timing.
  88     __asm__ __volatile__(
                 // Zero zmm0-zmm11, the registers the FMA instructions work on.
  89             "\tvpxord %%zmm0, %%zmm0, %%zmm0\n"
  90             "\tvmovaps %%zmm0, %%zmm1\n"
  91             "\tvmovaps %%zmm0, %%zmm2\n"
  92             "\tvmovaps %%zmm0, %%zmm3\n"
  93             "\tvmovaps %%zmm0, %%zmm4\n"
  94             "\tvmovaps %%zmm0, %%zmm5\n"
  95             "\tvmovaps %%zmm0, %%zmm6\n"
  96             "\tvmovaps %%zmm0, %%zmm7\n"
  97             "\tvmovaps %%zmm0, %%zmm8\n"
  98             "\tvmovaps %%zmm0, %%zmm9\n"
  99             "\tvmovaps %%zmm0, %%zmm10\n"
 100             "\tvmovaps %%zmm0, %%zmm11\n"
                 // Zero zmm12-zmm23 (permute destinations) and zmm30 (permute source).
 101             "\tvpxord %%zmm12, %%zmm12, %%zmm12\n"
 102             "\tvmovaps %%zmm12, %%zmm13\n"
 103             "\tvmovaps %%zmm12, %%zmm14\n"
 104             "\tvmovaps %%zmm12, %%zmm15\n"
 105             "\tvmovaps %%zmm12, %%zmm16\n"
 106             "\tvmovaps %%zmm12, %%zmm17\n"
 107             "\tvmovaps %%zmm12, %%zmm18\n"
 108             "\tvmovaps %%zmm12, %%zmm19\n"
 109             "\tvmovaps %%zmm12, %%zmm20\n"
 110             "\tvmovaps %%zmm12, %%zmm21\n"
 111             "\tvmovaps %%zmm12, %%zmm22\n"
 112             "\tvmovaps %%zmm12, %%zmm23\n"
 113             "\tvmovaps %%zmm12, %%zmm30\n"
                 // Start time stamp: combine edx:eax into one 64-bit value kept in rbx.
 114             "\trdtscp\n"
 115             "\tsalq $32, %%rdx\n"
 116             "\tmovl %%eax, %%eax\n"
 117             "\tmovq %%rdx, %%rbx\n"
 118             "\torq %%rax, %%rbx\n"
                 // rdx now becomes the loop counter, initialized from loopCount.
 119             "\tmovq %1, %%rdx\n"
 120             "1:\n"
                 // 12 FMAs; each one reads and writes only its own register.
 121             "\tvfmadd231pd %%zmm0, %%zmm0, %%zmm0\n"
 122             "\tvfmadd231pd %%zmm1, %%zmm1, %%zmm1\n"
 123             "\tvfmadd231pd %%zmm2, %%zmm2, %%zmm2\n"
 124             "\tvfmadd231pd %%zmm3, %%zmm3, %%zmm3\n"
 125             "\tvfmadd231pd %%zmm4, %%zmm4, %%zmm4\n"
 126             "\tvfmadd231pd %%zmm5, %%zmm5, %%zmm5\n"
 127             "\tvfmadd231pd %%zmm6, %%zmm6, %%zmm6\n"
 128             "\tvfmadd231pd %%zmm7, %%zmm7, %%zmm7\n"
 129             "\tvfmadd231pd %%zmm8, %%zmm8, %%zmm8\n"
 130             "\tvfmadd231pd %%zmm9, %%zmm9, %%zmm9\n"
 131             "\tvfmadd231pd %%zmm10, %%zmm10, %%zmm10\n"
 132             "\tvfmadd231pd %%zmm11, %%zmm11, %%zmm11\n"
                 // 12 permutes; all read zmm30 and each writes a different register.
 133             "\tvpermd %%zmm30, %%zmm30, %%zmm12\n"
 134             "\tvpermd %%zmm30, %%zmm30, %%zmm13\n"
 135             "\tvpermd %%zmm30, %%zmm30, %%zmm14\n"
 136             "\tvpermd %%zmm30, %%zmm30, %%zmm15\n"
 137             "\tvpermd %%zmm30, %%zmm30, %%zmm16\n"
 138             "\tvpermd %%zmm30, %%zmm30, %%zmm17\n"
 139             "\tvpermd %%zmm30, %%zmm30, %%zmm18\n"
 140             "\tvpermd %%zmm30, %%zmm30, %%zmm19\n"
 141             "\tvpermd %%zmm30, %%zmm30, %%zmm20\n"
 142             "\tvpermd %%zmm30, %%zmm30, %%zmm21\n"
 143             "\tvpermd %%zmm30, %%zmm30, %%zmm22\n"
 144             "\tvpermd %%zmm30, %%zmm30, %%zmm23\n"
                 // Go around again while the decremented counter is still positive.
 145             "\tdec %%rdx\n"
 146             "\tjg 1b\n"
                 // End time stamp; (end - start) is moved to the output operand.
 147             "\trdtscp\n"
 148             "\tsalq $32, %%rdx\n"
 149             "\tmovl %%eax, %%eax\n"
 150             "\torq %%rax, %%rdx\n"
 151             "\tsubq %%rbx, %%rdx\n"
 152             "\tmovq %%rdx, %0\n"
 153             : "=r"(cycles)
 154             : "r"(loopCount)
                 // Every register written above is declared as clobbered;
                 // rcx is in the list because rdtscp writes ecx as well.
 155             : "rax",
 156               "rbx",
 157               "rcx",
 158               "rdx",
 159               "zmm0",
 160               "zmm1",
 161               "zmm2",
 162               "zmm3",
 163               "zmm4",
 164               "zmm5",
 165               "zmm6",
 166               "zmm7",
 167               "zmm8",
 168               "zmm9",
 169               "zmm10",
 170               "zmm11",
 171               "zmm12",
 172               "zmm13",
 173               "zmm14",
 174               "zmm15",
 175               "zmm16",
 176               "zmm17",
 177               "zmm18",
 178               "zmm19",
 179               "zmm20",
 180               "zmm21",
 181               "zmm22",
 182               "zmm23",
 183               "zmm30");
 184
 185     return cycles;
 186 }
 187
 188 /*! \brief Loop over FMA AVX512 instructions
 189  *
 190  * This function executes a meaningless loop that includes only
 191  * FMA instructions from the AVX512 instruction set.
 192  * We need a bit of complex logic to make sure it cannot be
 193  * optimized away by the compiler.
      *
      * The time-stamp counter is read with rdtscp right before and right
      * after the loop, and the difference between the two readings is
      * what gets returned. The register setup in front of the first
      * rdtscp is therefore not part of the measurement.
      *
      * The loop counter is decremented and tested (signed, with jg) at the
      * end of the loop body, so the body is executed at least once, even
      * when \p loopCount is zero.
 194  *
 195  * \param loopCount  Number of iterations. Each iteration will
 196  *                   execute 12 FMA instructions.
 197  * \return Number of cycles used for the loop.
 198  */
 199 uint64_t timeFmaOnlyLoop(uint64_t loopCount)
 200 {
 201     uint64_t cycles;
 202     // Unfortunately we need to resort to inline ASM since we are
 203     // making a choice based on timing, and without efficient optimization
 204     // (e.g. when doing debugging) the usual intrinsics are often implemented
 205     // as independent load/store operations, which completely screws up timing.
 206     __asm__ __volatile__(
                 // Zero zmm0-zmm11, the registers the FMA instructions work on.
 207             "\tvpxord %%zmm0, %%zmm0, %%zmm0\n"
 208             "\tvmovaps %%zmm0, %%zmm1\n"
 209             "\tvmovaps %%zmm0, %%zmm2\n"
 210             "\tvmovaps %%zmm0, %%zmm3\n"
 211             "\tvmovaps %%zmm0, %%zmm4\n"
 212             "\tvmovaps %%zmm0, %%zmm5\n"
 213             "\tvmovaps %%zmm0, %%zmm6\n"
 214             "\tvmovaps %%zmm0, %%zmm7\n"
 215             "\tvmovaps %%zmm0, %%zmm8\n"
 216             "\tvmovaps %%zmm0, %%zmm9\n"
 217             "\tvmovaps %%zmm0, %%zmm10\n"
 218             "\tvmovaps %%zmm0, %%zmm11\n"
                 // Start time stamp: combine edx:eax into one 64-bit value kept in rbx.
 219             "\trdtscp\n"
 220             "\tsalq $32, %%rdx\n"
 221             "\tmovl %%eax, %%eax\n"
 222             "\tmovq %%rdx, %%rbx\n"
 223             "\torq %%rax, %%rbx\n"
                 // rdx now becomes the loop counter, initialized from loopCount.
 224             "\tmovq %1, %%rdx\n"
 225             "1:\n"
                 // 12 FMAs; each one reads and writes only its own register.
 226             "\tvfmadd231pd %%zmm0, %%zmm0, %%zmm0\n"
 227             "\tvfmadd231pd %%zmm1, %%zmm1, %%zmm1\n"
 228             "\tvfmadd231pd %%zmm2, %%zmm2, %%zmm2\n"
 229             "\tvfmadd231pd %%zmm3, %%zmm3, %%zmm3\n"
 230             "\tvfmadd231pd %%zmm4, %%zmm4, %%zmm4\n"
 231             "\tvfmadd231pd %%zmm5, %%zmm5, %%zmm5\n"
 232             "\tvfmadd231pd %%zmm6, %%zmm6, %%zmm6\n"
 233             "\tvfmadd231pd %%zmm7, %%zmm7, %%zmm7\n"
 234             "\tvfmadd231pd %%zmm8, %%zmm8, %%zmm8\n"
 235             "\tvfmadd231pd %%zmm9, %%zmm9, %%zmm9\n"
 236             "\tvfmadd231pd %%zmm10, %%zmm10, %%zmm10\n"
 237             "\tvfmadd231pd %%zmm11, %%zmm11, %%zmm11\n"
                 // Go around again while the decremented counter is still positive.
 238             "\tdec %%rdx\n"
 239             "\tjg 1b\n"
                 // End time stamp; (end - start) is moved to the output operand.
 240             "\trdtscp\n"
 241             "\tsalq $32, %%rdx\n"
 242             "\tmovl %%eax, %%eax\n"
 243             "\torq %%rax, %%rdx\n"
 244             "\tsubq %%rbx, %%rdx\n"
 245             "\tmovq %%rdx, %0\n"
 246             : "=r"(cycles)
 247             : "r"(loopCount)
                 // Every register written above is declared as clobbered;
                 // rcx is in the list because rdtscp writes ecx as well.
 248             : "rax",
 249               "rbx",
 250               "rcx",
 251               "rdx",
 252               "zmm0",
 253               "zmm1",
 254               "zmm2",
 255               "zmm3",
 256               "zmm4",
 257               "zmm5",
 258               "zmm6",
 259               "zmm7",
 260               "zmm8",
 261               "zmm9",
 262               "zmm10",
 263               "zmm11");
 264
 265     return cycles;
 266 }
 267
 268 bool checkDualAvx512FmaUnits()
 269 {
 270     uint64_t timeFmaAndShuf = static_cast<uint64_t>(1e9); // Large value
 271
 272     // Make sure the CPU is in AVX512 mode by executing a fairly long loop.
 273     // Use the return value to make sure it is not optimized away. Later invocations
 274     // use fewer iterations, so they should always be faster.
 275     uint64_t timeFmaOnly = timeFmaOnlyLoop(100000);
 276
 277     // Execute the loops three times
 278     for (int i = 0; i < 3; i++)
 279     {
 280         timeFmaAndShuf = std::min(timeFmaAndShuf, timeFmaAndShuffleLoop(1000));
 281         timeFmaOnly    = std::min(timeFmaOnly, timeFmaOnlyLoop(1000));
 282     }
 283
 284     return timeFmaAndShuf > 1.5 * timeFmaOnly;
 285 }
 286
 287 #endif // GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
 288
 289
 290 /*! \brief Mutex to guard the execution of the timing test
 291  *
 292  * We only execute the test once, and return the saved result
 293  * on subsequent calls.
      *
      * identifyAvx512FmaUnits() holds this mutex while it runs the
      * detection and stores the outcome in its function-local statics.
 294  */
 295 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
 296 std::mutex initMutex;
 297
 298 } // namespace
 299
 300 int identifyAvx512FmaUnits()
 301 {
 302     static bool initialized = false;
 303     static int  result      = 0;
 304
 305     if (!initialized)
 306     {
 307         std::lock_guard<std::mutex> lock(initMutex);
 308
 309         if (!initialized)
 310         {
 311             // For the standalone test binary we assume it will
 312             // only be executed on AVX512 hardware, but for the
 313             // library version we check the hardware support.
 314 #ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
 315             bool haveAvx512Hardware = true;
 316 #else
 317             bool haveAvx512Hardware = CpuInfo::detect().feature(CpuInfo::Feature::X86_Avx512F);
 318 #endif
 319
 320             if (haveAvx512Hardware)
 321             {
 322 #if GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
 323                 result = checkDualAvx512FmaUnits() ? 2 : 1;
 324 #else
 325                 result = -1; // Cannot run the tests
 326 #endif
 327             }
 328             else
 329             {
 330                 result = 0; // Not AVX-512 hardware
 331             }
 332             initialized = true;
 333         }
 334     }
 335     return result;
 336 }
 337
 338 } // namespace gmx
 339
 340 #ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
 341 int main()
 342 {
 343     printf("%d\n", gmx::identifyAvx512FmaUnits());
 344     return 0;
 345 }
 346 #endif