2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
37 * \brief Implements a routine to check the number of AVX512 fma units
39 * Just like the CpuInfo code, we need to be able to compile this file in stand-alone mode
40 * to set the SIMD acceleration and similar things during CMake configuration.
43 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
47 #include "identifyavx512fmaunits.h"
49 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
59 #ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
60 # include "gromacs/hardware/cpuinfo.h"
69 #if GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
70 /*! \brief Loop over mixed FMA and shuffle AVX512 instructions
72 * This function executes a meaningless loop that includes both
73 * FMA and shuffle instructions from the AVX512 instruction set.
74 * We need a bit of complex logic to make sure it cannot be
75 * optimized away by the compiler.
77 * \param loopCount Number of iterations. Each iteration will
78 * execute 12 FMA and 12 shuffle instructions.
79 * \return Number of cycles used for the loop.
81 uint64_t timeFmaAndShuffleLoop(uint64_t loopCount)
84 // Unfortunately we need to resort to inline ASM since we are
85 // making a choice based on timing, and without efficient optimization
86 // (e.g. when doing debugging) the usual intrinsics are often implemented
87 // as independent load/store operations, which completely screws up timing.
// Setup, part 1: zero zmm0 and replicate it into zmm1-zmm11. These are the
// twelve registers used by the FMA instructions in the timed section.
89 "\tvpxord %%zmm0, %%zmm0, %%zmm0\n"
90 "\tvmovaps %%zmm0, %%zmm1\n"
91 "\tvmovaps %%zmm0, %%zmm2\n"
92 "\tvmovaps %%zmm0, %%zmm3\n"
93 "\tvmovaps %%zmm0, %%zmm4\n"
94 "\tvmovaps %%zmm0, %%zmm5\n"
95 "\tvmovaps %%zmm0, %%zmm6\n"
96 "\tvmovaps %%zmm0, %%zmm7\n"
97 "\tvmovaps %%zmm0, %%zmm8\n"
98 "\tvmovaps %%zmm0, %%zmm9\n"
99 "\tvmovaps %%zmm0, %%zmm10\n"
100 "\tvmovaps %%zmm0, %%zmm11\n"
// Setup, part 2: zero zmm12 and replicate it into zmm13-zmm23 (the shuffle
// destinations) and into zmm30 (the only register the shuffles read).
101 "\tvpxord %%zmm12, %%zmm12, %%zmm12\n"
102 "\tvmovaps %%zmm12, %%zmm13\n"
103 "\tvmovaps %%zmm12, %%zmm14\n"
104 "\tvmovaps %%zmm12, %%zmm15\n"
105 "\tvmovaps %%zmm12, %%zmm16\n"
106 "\tvmovaps %%zmm12, %%zmm17\n"
107 "\tvmovaps %%zmm12, %%zmm18\n"
108 "\tvmovaps %%zmm12, %%zmm19\n"
109 "\tvmovaps %%zmm12, %%zmm20\n"
110 "\tvmovaps %%zmm12, %%zmm21\n"
111 "\tvmovaps %%zmm12, %%zmm22\n"
112 "\tvmovaps %%zmm12, %%zmm23\n"
113 "\tvmovaps %%zmm12, %%zmm30\n"
// Start value: rbx = (rdx << 32) | eax. The "movl %eax, %eax" zero-extends
// eax so that the upper half of rax cannot leak into the OR.
// NOTE(review): the instruction that fills edx:eax is not visible in this
// excerpt; presumably a time-stamp counter read, since the function is
// documented to return a cycle count - confirm.
115 "\tsalq $32, %%rdx\n"
116 "\tmovl %%eax, %%eax\n"
117 "\tmovq %%rdx, %%rbx\n"
118 "\torq %%rax, %%rbx\n"
// Timed work, FMA half: twelve FMAs, each reading and writing only its own
// register, so they form twelve mutually independent dependency chains.
121 "\tvfmadd231pd %%zmm0, %%zmm0, %%zmm0\n"
122 "\tvfmadd231pd %%zmm1, %%zmm1, %%zmm1\n"
123 "\tvfmadd231pd %%zmm2, %%zmm2, %%zmm2\n"
124 "\tvfmadd231pd %%zmm3, %%zmm3, %%zmm3\n"
125 "\tvfmadd231pd %%zmm4, %%zmm4, %%zmm4\n"
126 "\tvfmadd231pd %%zmm5, %%zmm5, %%zmm5\n"
127 "\tvfmadd231pd %%zmm6, %%zmm6, %%zmm6\n"
128 "\tvfmadd231pd %%zmm7, %%zmm7, %%zmm7\n"
129 "\tvfmadd231pd %%zmm8, %%zmm8, %%zmm8\n"
130 "\tvfmadd231pd %%zmm9, %%zmm9, %%zmm9\n"
131 "\tvfmadd231pd %%zmm10, %%zmm10, %%zmm10\n"
132 "\tvfmadd231pd %%zmm11, %%zmm11, %%zmm11\n"
// Timed work, shuffle half: twelve permutes that all read only zmm30 and
// each write a distinct register (zmm12-zmm23), so no shuffle depends on
// another shuffle or on any of the FMAs above.
133 "\tvpermd %%zmm30, %%zmm30, %%zmm12\n"
134 "\tvpermd %%zmm30, %%zmm30, %%zmm13\n"
135 "\tvpermd %%zmm30, %%zmm30, %%zmm14\n"
136 "\tvpermd %%zmm30, %%zmm30, %%zmm15\n"
137 "\tvpermd %%zmm30, %%zmm30, %%zmm16\n"
138 "\tvpermd %%zmm30, %%zmm30, %%zmm17\n"
139 "\tvpermd %%zmm30, %%zmm30, %%zmm18\n"
140 "\tvpermd %%zmm30, %%zmm30, %%zmm19\n"
141 "\tvpermd %%zmm30, %%zmm30, %%zmm20\n"
142 "\tvpermd %%zmm30, %%zmm30, %%zmm21\n"
143 "\tvpermd %%zmm30, %%zmm30, %%zmm22\n"
144 "\tvpermd %%zmm30, %%zmm30, %%zmm23\n"
// End value: rdx = (rdx << 32) | eax, then subtract the start value kept in
// rbx, leaving the difference in rdx.
// NOTE(review): the loop counter/branch, the second counter read and the
// asm operand and clobber lists are not visible in this excerpt - confirm
// against the full file that rbx and the zmm registers are declared clobbered.
148 "\tsalq $32, %%rdx\n"
149 "\tmovl %%eax, %%eax\n"
150 "\torq %%rax, %%rdx\n"
151 "\tsubq %%rbx, %%rdx\n"
188 /*! \brief Loop over FMA AVX512 instructions
190 * This function executes a meaningless loop that includes only
191 * FMA instructions from the AVX512 instruction set.
192 * We need a bit of complex logic to make sure it cannot be
193 * optimized away by the compiler.
195 * \param loopCount Number of iterations. Each iteration will
196 * execute 12 FMA instructions.
197 * \return Number of cycles used for the loop.
199 uint64_t timeFmaOnlyLoop(uint64_t loopCount)
202 // Unfortunately we need to resort to inline ASM since we are
203 // making a choice based on timing, and without efficient optimization
204 // (e.g. when doing debugging) the usual intrinsics are often implemented
205 // as independent load/store operations, which completely screws up timing.
// __volatile__ asks the compiler not to delete or move the asm statement.
206 __asm__ __volatile__(
// Setup: zero zmm0 and replicate it into zmm1-zmm11, the twelve registers
// used by the FMA instructions in the timed section.
207 "\tvpxord %%zmm0, %%zmm0, %%zmm0\n"
208 "\tvmovaps %%zmm0, %%zmm1\n"
209 "\tvmovaps %%zmm0, %%zmm2\n"
210 "\tvmovaps %%zmm0, %%zmm3\n"
211 "\tvmovaps %%zmm0, %%zmm4\n"
212 "\tvmovaps %%zmm0, %%zmm5\n"
213 "\tvmovaps %%zmm0, %%zmm6\n"
214 "\tvmovaps %%zmm0, %%zmm7\n"
215 "\tvmovaps %%zmm0, %%zmm8\n"
216 "\tvmovaps %%zmm0, %%zmm9\n"
217 "\tvmovaps %%zmm0, %%zmm10\n"
218 "\tvmovaps %%zmm0, %%zmm11\n"
// Start value: rbx = (rdx << 32) | eax. The "movl %eax, %eax" zero-extends
// eax so that the upper half of rax cannot leak into the OR.
// NOTE(review): the instruction that fills edx:eax is not visible in this
// excerpt; presumably a time-stamp counter read, since the function is
// documented to return a cycle count - confirm.
220 "\tsalq $32, %%rdx\n"
221 "\tmovl %%eax, %%eax\n"
222 "\tmovq %%rdx, %%rbx\n"
223 "\torq %%rax, %%rbx\n"
// Timed work: the same twelve independent FMA chains as in
// timeFmaAndShuffleLoop(), but without any shuffle instructions.
226 "\tvfmadd231pd %%zmm0, %%zmm0, %%zmm0\n"
227 "\tvfmadd231pd %%zmm1, %%zmm1, %%zmm1\n"
228 "\tvfmadd231pd %%zmm2, %%zmm2, %%zmm2\n"
229 "\tvfmadd231pd %%zmm3, %%zmm3, %%zmm3\n"
230 "\tvfmadd231pd %%zmm4, %%zmm4, %%zmm4\n"
231 "\tvfmadd231pd %%zmm5, %%zmm5, %%zmm5\n"
232 "\tvfmadd231pd %%zmm6, %%zmm6, %%zmm6\n"
233 "\tvfmadd231pd %%zmm7, %%zmm7, %%zmm7\n"
234 "\tvfmadd231pd %%zmm8, %%zmm8, %%zmm8\n"
235 "\tvfmadd231pd %%zmm9, %%zmm9, %%zmm9\n"
236 "\tvfmadd231pd %%zmm10, %%zmm10, %%zmm10\n"
237 "\tvfmadd231pd %%zmm11, %%zmm11, %%zmm11\n"
// End value: rdx = (rdx << 32) | eax, then subtract the start value kept in
// rbx, leaving the difference in rdx.
// NOTE(review): the loop counter/branch, the second counter read and the
// asm operand and clobber lists are not visible in this excerpt - confirm
// against the full file that rbx and the zmm registers are declared clobbered.
241 "\tsalq $32, %%rdx\n"
242 "\tmovl %%eax, %%eax\n"
243 "\torq %%rax, %%rdx\n"
244 "\tsubq %%rbx, %%rdx\n"
// Compares the best observed time of the FMA+shuffle loop with the best
// observed time of the FMA-only loop. Returns true when the mixed loop takes
// more than 1.5 times as long as the FMA-only loop; the caller maps true to
// "2 FMA units" and false to "1 FMA unit".
268 bool checkDualAvx512FmaUnits()
// Seed the minimum with a value larger than any expected 1000-iteration run.
270 uint64_t timeFmaAndShuf = static_cast<uint64_t>(1e9); // Large value
272 // Make sure the CPU is in AVX512 mode by executing a fairly long loop.
273 // Use the return value to make sure it is not optimized away. Later invocations
274 // use fewer iterations, so they should always be faster.
275 uint64_t timeFmaOnly = timeFmaOnlyLoop(100000);
277 // Execute the loops three times
// Keeping the minimum of the repeats retains the least-disturbed measurement.
278 for (int i = 0; i < 3; i++)
280 timeFmaAndShuf = std::min(timeFmaAndShuf, timeFmaAndShuffleLoop(1000));
281 timeFmaOnly = std::min(timeFmaOnly, timeFmaOnlyLoop(1000));
// The 1.5 factor promotes the right-hand side to double, so the comparison
// is carried out in floating point.
284 return timeFmaAndShuf > 1.5 * timeFmaOnly;
287 #endif // GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
290 /*! \brief Mutex to guard the execution of the timing test
292 * We only execute the test once, and return the saved result
293 * on subsequent calls.
// Acquired through std::lock_guard inside identifyAvx512FmaUnits().
295 // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
296 std::mutex initMutex;
// Reports the AVX-512 FMA unit count as an integer code:
//    2 : AVX-512 hardware and checkDualAvx512FmaUnits() returned true
//    1 : AVX-512 hardware and checkDualAvx512FmaUnits() returned false
//    0 : no AVX-512 hardware
//   -1 : AVX-512 hardware, but the timing test cannot be run (the inline-asm
//        test is only compiled when GMX_X86_GCC_INLINE_ASM and
//        SIMD_AVX_512_CXX_SUPPORTED are both set)
300 int identifyAvx512FmaUnits()
// Function-local statics cache the outcome across calls.
302 static bool initialized = false;
303 static int result = 0;
// NOTE(review): the statements that test and set `initialized` are not
// visible in this excerpt. If `initialized` is read without holding
// initMutex, a plain bool is a data race; std::atomic<bool> or
// std::call_once would be the safe forms - confirm against the full file.
307 std::lock_guard<std::mutex> lock(initMutex);
311 // For the standalone test binary we assume it will
312 // only be executed on AVX512 hardware, but for the
313 // library version we check the hardware support.
314 #ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
315 bool haveAvx512Hardware = true;
// Library build: ask CpuInfo whether the AVX-512F feature is present.
317 bool haveAvx512Hardware = CpuInfo::detect().feature(CpuInfo::Feature::X86_Avx512F);
320 if (haveAvx512Hardware)
322 #if GMX_X86_GCC_INLINE_ASM && SIMD_AVX_512_CXX_SUPPORTED
323 result = checkDualAvx512FmaUnits() ? 2 : 1;
325 result = -1; // Cannot run the tests
330 result = 0; // Not AVX-512 hardware
340 #ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
343 printf("%d\n", gmx::identifyAvx512FmaUnits());