2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * \brief Implements SIMD architecture support query routines
40 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
42 * \ingroup module_simd
57 #include "gromacs/hardware/cpuinfo.h"
58 #include "gromacs/hardware/identifyavx512fmaunits.h"
59 #include "gromacs/utility/stringutil.h"
67 simdString(SimdType s)
69 static const std::map<SimdType, std::string> name =
71 { SimdType::None, "None" },
72 { SimdType::Reference, "Reference" },
73 { SimdType::Generic, "Generic" },
74 { SimdType::X86_Sse2, "SSE2" },
75 { SimdType::X86_Sse4_1, "SSE4.1" },
76 { SimdType::X86_Avx128Fma, "AVX_128_FMA" },
77 { SimdType::X86_Avx, "AVX_256" },
78 { SimdType::X86_Avx2, "AVX2_256" },
79 { SimdType::X86_Avx2_128, "AVX2_128" },
80 { SimdType::X86_Avx512, "AVX_512" },
81 { SimdType::X86_Avx512Knl, "AVX_512_KNL" },
82 { SimdType::X86_Mic, "X86_MIC" },
83 { SimdType::Arm_Neon, "ARM_NEON" },
84 { SimdType::Arm_NeonAsimd, "ARM_NEON_ASIMD" },
85 { SimdType::Ibm_Vmx, "IBM_VMX" },
86 { SimdType::Ibm_Vsx, "IBM_VSX" },
87 { SimdType::Fujitsu_HpcAce, "Fujitsu HPC-ACE" }
97 //! Helper to detect correct AMD Zen architecture.
99 cpuIsAmdZen1(const CpuInfo &cpuInfo)
101 // Both Zen/Zen+/Zen2 have family==23
102 // Model numbers for Zen:
103 // 1) Naples, Whitehaven, Summit ridge, and Snowy Owl
105 // Model numbers for Zen+:
108 return (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Amd &&
109 cpuInfo.family() == 23 &&
110 (cpuInfo.model() == 1 || cpuInfo.model() == 17 ||
111 cpuInfo.model() == 8 || cpuInfo.model() == 24) );
118 simdSuggested(const CpuInfo &c)
120 SimdType suggested = SimdType::None;
122 if (c.supportLevel() >= CpuInfo::SupportLevel::Features)
126 case CpuInfo::Vendor::Intel:
127 if (c.feature(CpuInfo::Feature::X86_Avx512ER))
129 suggested = SimdType::X86_Avx512Knl;
131 else if (c.feature(CpuInfo::Feature::X86_Avx512F))
133 // If we could not identify the number of AVX512 FMA units we assume 2
134 suggested = ( identifyAvx512FmaUnits() == 1 ) ? SimdType::X86_Avx2 : SimdType::X86_Avx512;
136 else if (c.feature(CpuInfo::Feature::X86_Avx2))
138 suggested = SimdType::X86_Avx2;
140 else if (c.feature(CpuInfo::Feature::X86_Avx))
142 suggested = SimdType::X86_Avx;
144 else if (c.feature(CpuInfo::Feature::X86_Sse4_1))
146 suggested = SimdType::X86_Sse4_1;
148 else if (c.feature(CpuInfo::Feature::X86_Sse2))
150 suggested = SimdType::X86_Sse2;
153 case CpuInfo::Vendor::Amd:
154 case CpuInfo::Vendor::Hygon:
155 if (c.feature(CpuInfo::Feature::X86_Avx2))
157 // AMD Zen supports 256-bit AVX2, but Zen1 performs better with 128-bit
158 // since it can execute two independent such instructions per cycle,
159 // and wider SIMD has slightly lower efficiency in GROMACS.
160 // However... Zen2 supports full-width execution of 256-bit AVX2,
161 // so we only want to apply this hack to Zen/Zen+.
162 suggested = cpuIsAmdZen1(c) ? SimdType::X86_Avx2_128 : SimdType::X86_Avx2;
164 else if (c.feature(CpuInfo::Feature::X86_Avx))
166 // Use 128-bit FMA SIMD if Fma4 flag is set, otherwise plain 256-bit AVX
167 if (c.feature(CpuInfo::Feature::X86_Fma4))
169 suggested = SimdType::X86_Avx128Fma;
173 suggested = SimdType::X86_Avx;
176 else if (c.feature(CpuInfo::Feature::X86_Sse4_1))
178 suggested = SimdType::X86_Sse4_1;
180 else if (c.feature(CpuInfo::Feature::X86_Sse2))
182 suggested = SimdType::X86_Sse2;
186 case CpuInfo::Vendor::Arm:
187 if (c.feature(CpuInfo::Feature::Arm_NeonAsimd))
189 suggested = SimdType::Arm_NeonAsimd;
191 else if (c.feature(CpuInfo::Feature::Arm_Neon))
193 suggested = SimdType::Arm_Neon;
196 case CpuInfo::Vendor::Ibm:
197 if (c.feature(CpuInfo::Feature::Ibm_Vsx))
199 suggested = SimdType::Ibm_Vsx;
201 else if (c.feature(CpuInfo::Feature::Ibm_Vmx))
203 suggested = SimdType::Ibm_Vmx;
206 case CpuInfo::Vendor::Fujitsu:
207 if (c.feature(CpuInfo::Feature::Fujitsu_HpcAce))
209 suggested = SimdType::Fujitsu_HpcAce;
222 #if GMX_SIMD_X86_AVX_512_KNL
223 return SimdType::X86_Avx512Knl;
224 #elif GMX_SIMD_X86_AVX_512
225 return SimdType::X86_Avx512;
226 #elif GMX_SIMD_X86_MIC
227 return SimdType::X86_Mic;
228 #elif GMX_SIMD_X86_AVX2_256
229 return SimdType::X86_Avx2;
230 #elif GMX_SIMD_X86_AVX2_128
231 return SimdType::X86_Avx2_128;
232 #elif GMX_SIMD_X86_AVX_256
233 return SimdType::X86_Avx;
234 #elif GMX_SIMD_X86_AVX_128_FMA
235 return SimdType::X86_Avx128Fma;
236 #elif GMX_SIMD_X86_SSE4_1
237 return SimdType::X86_Sse4_1;
238 #elif GMX_SIMD_X86_SSE2
239 return SimdType::X86_Sse2;
240 #elif GMX_SIMD_ARM_NEON
241 return SimdType::Arm_Neon;
242 #elif GMX_SIMD_ARM_NEON_ASIMD
243 return SimdType::Arm_NeonAsimd;
244 #elif GMX_SIMD_IBM_VMX
245 return SimdType::Ibm_Vmx;
246 #elif GMX_SIMD_IBM_VSX
247 return SimdType::Ibm_Vsx;
248 #elif GMX_SIMD_SPARC64_HPC_ACE
249 return SimdType::Fujitsu_HpcAce;
250 #elif GMX_SIMD_REFERENCE
251 return SimdType::Reference;
253 return SimdType::None;
258 simdCheck(gmx::SimdType wanted,
262 SimdType compiled = simdCompiled();
264 gmx::TextLineWrapper wrapper;
268 wrapper.settings().setLineLength(78);
270 if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx512)
272 logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
273 "SIMD instructions selected at compile time: %s\n"
274 "This program was compiled for different hardware than you are running on, "
275 "which could influence performance. This build might have been configured on "
276 "a login node with only a single AVX-512 FMA unit (in which case AVX2 is faster), "
277 "while the node you are running on has dual AVX-512 FMA units.",
278 simdString(wanted).c_str(), simdString(compiled).c_str()));
279 warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
280 simdString(compiled).c_str(), simdString(wanted).c_str()));
282 else if (compiled == SimdType::X86_Avx512 && wanted == SimdType::X86_Avx2 && identifyAvx512FmaUnits() == 1)
284 // The reason for explicitly checking the number of FMA units above is to avoid triggering
285 // this conditional if the AVX2 SIMD was requested by some other node in a heterogeneous MPI run.
286 logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
287 "SIMD instructions selected at compile time: %s\n"
288 "This program was compiled for different hardware than you are running on, "
289 "which could influence performance."
290 "This host supports AVX-512, but since it only has 1 AVX-512"
291 "FMA unit, it would be faster to use AVX2 instead.",
292 simdString(wanted).c_str(), simdString(compiled).c_str()));
293 warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
294 simdString(compiled).c_str(), simdString(wanted).c_str()));
296 else if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx2_128)
298 // Wanted SimdType::X86_Avx2_128 can only be the AMD Zen architecture.
299 // AVX2_256 is only up to a few percent slower than AVX2_128
300 // in both single and double precision. AVX2_256 is slightly
301 // faster with nonbondeds and PME on a GPU. Don't warn the user.
303 else if (compiled > wanted && !(compiled == SimdType::X86_Avx && wanted == SimdType::X86_Avx128Fma))
305 // Normally it is close to catastrophic if the compiled SIMD type is larger than
306 // the supported one, but AVX128Fma is an exception: AMD CPUs will (strongly) prefer
307 // AVX128Fma, but they will work fine with AVX too. Thus, make an exception for this.
308 logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
309 "SIMD instructions selected at compile time: %s\n"
310 "Compiled SIMD newer than requested; program might crash.",
311 simdString(wanted).c_str(), simdString(compiled).c_str()));
314 else if (wanted != compiled)
316 // This warning will also occur if compiled is X86_Avx and wanted is X86_Avx128Fma
317 logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
318 "SIMD instructions selected at compile time: %s\n"
319 "This program was compiled for different hardware than you are running on, "
320 "which could influence performance.",
321 simdString(wanted).c_str(), simdString(compiled).c_str()));
322 warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
323 simdString(compiled).c_str(), simdString(wanted).c_str()));
326 if (!logMsg.empty() && log != nullptr)
328 fprintf(log, "%s\n", logMsg.c_str());
330 if (!warnMsg.empty() && warnToStdErr)
332 fprintf(stderr, "%s\n", warnMsg.c_str());
335 return (wanted == compiled);