src/gromacs/hardware/identifyavx512fmaunits.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2017,2018, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 /*! \internal \file
  37  * \brief Implements a routine to check the number of AVX512 fma units
  38  *
  39  * Just as the CpuInfo code, we need to be able to compile this file in stand-alone mode
  40  * to set the SIMD acceleration and similar things during CMake configuration.
  41  */
  42
  43 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
  44 #include "gmxpre.h"
  45 #endif
  46
  47 #include "identifyavx512fmaunits.h"
  48
  49 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
  50 #include "config.h"
  51 #endif
  52
  53 #include <cstdint>
  54 #include <cstdio>
  55
  56 #include <algorithm>
  57 #include <mutex>
  58
  59 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
  60 #include "gromacs/hardware/cpuinfo.h"
  61 #endif
  62
  63 namespace gmx
  64 {
  65
  66 namespace
  67 {
  68
  69 #if GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
  70 /*! \brief Loop over mixed FMA and shuffle AVX512 instructions
  71  *
  72  * This function executes a meaningless loop that includes both
  73  * FMA and shuffle instructions from the AVX512 instruction set.
  74  * Everything that is timed lives in one volatile inline-assembly
  75  * statement, so that it cannot be optimized away by the compiler.
  76  *
  77  * \param loopCount  Number of iterations. Each iteration will
  78  *                   execute 12 FMA and 12 shuffle instructions.
  79  * \return Time-stamp counter ticks elapsed between the rdtscp reads around the loop.
  80  */
  81 uint64_t
  82 timeFmaAndShuffleLoop(uint64_t loopCount)
  83 {
  84     uint64_t cycles;
  85     // Unfortunately we need to resort to inline ASM since we are
  86     // making a choice based on timing, and without efficient optimization
  87     // (e.g. when doing debugging) the usual intrinsics are often implemented
  88     // as independent load/store operations, which completely screws up timing.
  89     __asm__ __volatile__("\tvpxord %%zmm0, %%zmm0, %%zmm0\n" // zmm0 = 0
  90                          "\tvmovaps %%zmm0, %%zmm1\n" // copy the zero into zmm1-zmm11 (the FMA registers)
  91                          "\tvmovaps %%zmm0, %%zmm2\n"
  92                          "\tvmovaps %%zmm0, %%zmm3\n"
  93                          "\tvmovaps %%zmm0, %%zmm4\n"
  94                          "\tvmovaps %%zmm0, %%zmm5\n"
  95                          "\tvmovaps %%zmm0, %%zmm6\n"
  96                          "\tvmovaps %%zmm0, %%zmm7\n"
  97                          "\tvmovaps %%zmm0, %%zmm8\n"
  98                          "\tvmovaps %%zmm0, %%zmm9\n"
  99                          "\tvmovaps %%zmm0, %%zmm10\n"
 100                          "\tvmovaps %%zmm0, %%zmm11\n"
 101                          "\tvpxord %%zmm12, %%zmm12, %%zmm12\n" // zmm12 = 0
 102                          "\tvmovaps %%zmm12, %%zmm13\n" // copy the zero into zmm13-zmm23 and zmm30 (the shuffle registers)
 103                          "\tvmovaps %%zmm12, %%zmm14\n"
 104                          "\tvmovaps %%zmm12, %%zmm15\n"
 105                          "\tvmovaps %%zmm12, %%zmm16\n"
 106                          "\tvmovaps %%zmm12, %%zmm17\n"
 107                          "\tvmovaps %%zmm12, %%zmm18\n"
 108                          "\tvmovaps %%zmm12, %%zmm19\n"
 109                          "\tvmovaps %%zmm12, %%zmm20\n"
 110                          "\tvmovaps %%zmm12, %%zmm21\n"
 111                          "\tvmovaps %%zmm12, %%zmm22\n"
 112                          "\tvmovaps %%zmm12, %%zmm23\n"
 113                          "\tvmovaps %%zmm12, %%zmm30\n"
 114                          "\trdtscp\n" // start time stamp in edx:eax
 115                          "\tsalq $32, %%rdx\n" // move the high half into bits 32-63 of rdx
 116                          "\tmovl %%eax, %%eax\n" // clear the upper half of rax
 117                          "\tmovq %%rdx, %%rbx\n"
 118                          "\torq %%rax, %%rbx\n" // rbx = 64-bit start time stamp
 119                          "\tmovq %1, %%rdx\n" // rdx = loopCount, used as the loop counter
 120                          "1:\n" // loop body: 12 FMAs followed by 12 shuffles
 121                          "\tvfmadd231pd %%zmm0, %%zmm0, %%zmm0\n" // each FMA reads and writes only its own register
 122                          "\tvfmadd231pd %%zmm1, %%zmm1, %%zmm1\n"
 123                          "\tvfmadd231pd %%zmm2, %%zmm2, %%zmm2\n"
 124                          "\tvfmadd231pd %%zmm3, %%zmm3, %%zmm3\n"
 125                          "\tvfmadd231pd %%zmm4, %%zmm4, %%zmm4\n"
 126                          "\tvfmadd231pd %%zmm5, %%zmm5, %%zmm5\n"
 127                          "\tvfmadd231pd %%zmm6, %%zmm6, %%zmm6\n"
 128                          "\tvfmadd231pd %%zmm7, %%zmm7, %%zmm7\n"
 129                          "\tvfmadd231pd %%zmm8, %%zmm8, %%zmm8\n"
 130                          "\tvfmadd231pd %%zmm9, %%zmm9, %%zmm9\n"
 131                          "\tvfmadd231pd %%zmm10, %%zmm10, %%zmm10\n"
 132                          "\tvfmadd231pd %%zmm11, %%zmm11, %%zmm11\n"
 133                          "\tvpermd %%zmm30, %%zmm30, %%zmm12\n" // shuffles read only zmm30 and write zmm12-zmm23
 134                          "\tvpermd %%zmm30, %%zmm30, %%zmm13\n"
 135                          "\tvpermd %%zmm30, %%zmm30, %%zmm14\n"
 136                          "\tvpermd %%zmm30, %%zmm30, %%zmm15\n"
 137                          "\tvpermd %%zmm30, %%zmm30, %%zmm16\n"
 138                          "\tvpermd %%zmm30, %%zmm30, %%zmm17\n"
 139                          "\tvpermd %%zmm30, %%zmm30, %%zmm18\n"
 140                          "\tvpermd %%zmm30, %%zmm30, %%zmm19\n"
 141                          "\tvpermd %%zmm30, %%zmm30, %%zmm20\n"
 142                          "\tvpermd %%zmm30, %%zmm30, %%zmm21\n"
 143                          "\tvpermd %%zmm30, %%zmm30, %%zmm22\n"
 144                          "\tvpermd %%zmm30, %%zmm30, %%zmm23\n"
 145                          "\tdec %%rdx\n" // decrement the loop counter
 146                          "\tjg 1b\n" // repeat while the counter is > 0 (signed); the body runs at least once
 147                          "\trdtscp\n" // end time stamp in edx:eax
 148                          "\tsalq $32, %%rdx\n"
 149                          "\tmovl %%eax, %%eax\n"
 150                          "\torq %%rax, %%rdx\n" // rdx = 64-bit end time stamp
 151                          "\tsubq %%rbx, %%rdx\n" // rdx = end - start
 152                          "\tmovq %%rdx, %0\n" // cycles = rdx
 153                          : "=r" (cycles) : "r" (loopCount) // %0 = cycles (output), %1 = loopCount (input)
 154                          : "rax", "rbx", "rcx", "rdx", "zmm0", "zmm1", "zmm2", "zmm3", // clobbers; rdtscp also writes ecx
 155                          "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10",
 156                          "zmm11", "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
 157                          "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23", "zmm30");
 158
 159     return cycles;
 160 }
 161
 162 /*! \brief Loop over FMA AVX512 instructions
 163  *
 164  * This function executes a meaningless loop that includes only
 165  * FMA instructions from the AVX512 instruction set.
 166  * Everything that is timed lives in one volatile inline-assembly
 167  * statement, so that it cannot be optimized away by the compiler.
 168  *
 169  * \param loopCount  Number of iterations. Each iteration will
 170  *                   execute 12 FMA instructions.
 171  * \return Time-stamp counter ticks elapsed between the rdtscp reads around the loop.
 172  */
 173 uint64_t
 174 timeFmaOnlyLoop(uint64_t loopCount)
 175 {
 176     uint64_t cycles;
 177     // Unfortunately we need to resort to inline ASM since we are
 178     // making a choice based on timing, and without efficient optimization
 179     // (e.g. when doing debugging) the usual intrinsics are often implemented
 180     // as independent load/store operations, which completely screws up timing.
 181     __asm__ __volatile__("\tvpxord %%zmm0, %%zmm0, %%zmm0\n" // zmm0 = 0
 182                          "\tvmovaps %%zmm0, %%zmm1\n" // copy the zero into zmm1-zmm11 (the FMA registers)
 183                          "\tvmovaps %%zmm0, %%zmm2\n"
 184                          "\tvmovaps %%zmm0, %%zmm3\n"
 185                          "\tvmovaps %%zmm0, %%zmm4\n"
 186                          "\tvmovaps %%zmm0, %%zmm5\n"
 187                          "\tvmovaps %%zmm0, %%zmm6\n"
 188                          "\tvmovaps %%zmm0, %%zmm7\n"
 189                          "\tvmovaps %%zmm0, %%zmm8\n"
 190                          "\tvmovaps %%zmm0, %%zmm9\n"
 191                          "\tvmovaps %%zmm0, %%zmm10\n"
 192                          "\tvmovaps %%zmm0, %%zmm11\n"
 193                          "\trdtscp\n" // start time stamp in edx:eax
 194                          "\tsalq $32, %%rdx\n" // move the high half into bits 32-63 of rdx
 195                          "\tmovl %%eax, %%eax\n" // clear the upper half of rax
 196                          "\tmovq %%rdx, %%rbx\n"
 197                          "\torq %%rax, %%rbx\n" // rbx = 64-bit start time stamp
 198                          "\tmovq %1, %%rdx\n" // rdx = loopCount, used as the loop counter
 199                          "1:\n" // loop body: 12 FMAs
 200                          "\tvfmadd231pd %%zmm0, %%zmm0, %%zmm0\n" // each FMA reads and writes only its own register
 201                          "\tvfmadd231pd %%zmm1, %%zmm1, %%zmm1\n"
 202                          "\tvfmadd231pd %%zmm2, %%zmm2, %%zmm2\n"
 203                          "\tvfmadd231pd %%zmm3, %%zmm3, %%zmm3\n"
 204                          "\tvfmadd231pd %%zmm4, %%zmm4, %%zmm4\n"
 205                          "\tvfmadd231pd %%zmm5, %%zmm5, %%zmm5\n"
 206                          "\tvfmadd231pd %%zmm6, %%zmm6, %%zmm6\n"
 207                          "\tvfmadd231pd %%zmm7, %%zmm7, %%zmm7\n"
 208                          "\tvfmadd231pd %%zmm8, %%zmm8, %%zmm8\n"
 209                          "\tvfmadd231pd %%zmm9, %%zmm9, %%zmm9\n"
 210                          "\tvfmadd231pd %%zmm10, %%zmm10, %%zmm10\n"
 211                          "\tvfmadd231pd %%zmm11, %%zmm11, %%zmm11\n"
 212                          "\tdec %%rdx\n" // decrement the loop counter
 213                          "\tjg 1b\n" // repeat while the counter is > 0 (signed); the body runs at least once
 214                          "\trdtscp\n" // end time stamp in edx:eax
 215                          "\tsalq $32, %%rdx\n"
 216                          "\tmovl %%eax, %%eax\n"
 217                          "\torq %%rax, %%rdx\n" // rdx = 64-bit end time stamp
 218                          "\tsubq %%rbx, %%rdx\n" // rdx = end - start
 219                          "\tmovq %%rdx, %0\n" // cycles = rdx
 220                          : "=r" (cycles) : "r" (loopCount) // %0 = cycles (output), %1 = loopCount (input)
 221                          : "rax", "rbx", "rcx", "rdx", "zmm0", "zmm1", "zmm2", "zmm3", // clobbers; rdtscp also writes ecx
 222                          "zmm4", "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11");
 223
 224     return cycles;
 225 }
 226
 227 int
 228 checkDualAvx512FmaUnits()
 229 {
 230     uint64_t timeFmaAndShuf = 1e9;             // Large value
 231
 232     // Make sure the CPU is in AVX512 mode by executing a fairly long loop.
 233     // Use the return value to make sure it is not optimized away. Later invocations
 234     // use fewer iterations, so they should always be faster.
 235     uint64_t timeFmaOnly    = timeFmaOnlyLoop(100000);
 236
 237     // Execute the loops three times
 238     for (int i = 0; i < 3; i++)
 239     {
 240         timeFmaAndShuf = std::min(timeFmaAndShuf, timeFmaAndShuffleLoop(1000) );
 241         timeFmaOnly    = std::min(timeFmaOnly, timeFmaOnlyLoop(1000) );
 242     }
 243
 244     return (timeFmaAndShuf > 1.5 * timeFmaOnly);
 245 }
 246
 247 #endif  // GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
 248
 249
 250 /*! \brief Mutex to guard the execution of the timing test
 251  *
 252  * Locked by identifyAvx512FmaUnits(), which executes the test only
 253  * once and returns the saved result on subsequent calls.
 254  */
 255 std::mutex initMutex;
 256
 257 }   // namespace
 258
 259 int
 260 identifyAvx512FmaUnits()
 261 {
 262     static bool initialized = false;
 263     static int  result      = false;
 264
 265     if (!initialized)
 266     {
 267         std::lock_guard<std::mutex>  lock(initMutex);
 268
 269         if (!initialized)
 270         {
 271             // For the standalone test binary we assume it will
 272             // only be executed on AVX512 hardware, but for the
 273             // library version we check the hardware support.
 274 #ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
 275             bool haveAvx512Hardware = true;
 276 #else
 277             bool haveAvx512Hardware = CpuInfo::detect().feature(CpuInfo::Feature::X86_Avx512F);
 278 #endif
 279
 280             if (haveAvx512Hardware)
 281             {
 282 #if GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
 283                 result = checkDualAvx512FmaUnits() ? 2 : 1;
 284 #else
 285                 result = -1; // Cannot run the tests
 286 #endif
 287             }
 288             else
 289             {
 290                 result = 0; // Not AVX-512 hardware
 291             }
 292             initialized = true;
 293         }
 294     }
 295     return result;
 296 }
 297
 298 } // namespace gmx
 299
 300 #ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
 301 int main() // Stand-alone entry point used when this file is built on its own
 302 {
 303     const int fmaUnitCode = gmx::identifyAvx512FmaUnits();
 304     printf("%d\n", fmaUnitCode); // one line on stdout holding the result code
 305     return 0;
 306 }
 307 #endif