2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
38 * \brief Implements SIMD architecture support query routines
40 * \author Erik Lindahl <erik.lindahl@scilifelab.se>
42 * \ingroup module_simd
61 #include "gromacs/hardware/cpuinfo.h"
62 #include "gromacs/hardware/identifyavx512fmaunits.h"
63 #include "gromacs/utility/fatalerror.h"
64 #include "gromacs/utility/stringutil.h"
71 const std::string& simdString(SimdType s)
73 static const std::map<SimdType, std::string> name = {
74 { SimdType::None, "None" },
75 { SimdType::Reference, "Reference" },
76 { SimdType::Generic, "Generic" },
77 { SimdType::X86_Sse2, "SSE2" },
78 { SimdType::X86_Sse4_1, "SSE4.1" },
79 { SimdType::X86_Avx128Fma, "AVX_128_FMA" },
80 { SimdType::X86_Avx, "AVX_256" },
81 { SimdType::X86_Avx2, "AVX2_256" },
82 { SimdType::X86_Avx2_128, "AVX2_128" },
83 { SimdType::X86_Avx512, "AVX_512" },
84 { SimdType::X86_Avx512Knl, "AVX_512_KNL" },
85 { SimdType::X86_Mic, "X86_MIC" },
86 { SimdType::Arm_Neon, "ARM_NEON" },
87 { SimdType::Arm_NeonAsimd, "ARM_NEON_ASIMD" },
88 { SimdType::Arm_Sve, "ARM_SVE" },
89 { SimdType::Ibm_Vmx, "IBM_VMX" },
90 { SimdType::Ibm_Vsx, "IBM_VSX" },
91 { SimdType::Fujitsu_HpcAce, "Fujitsu HPC-ACE" }
101 //! Helper to detect correct AMD Zen architecture.
102 bool cpuIsAmdZen1(const CpuInfo& cpuInfo)
104 // Both Zen/Zen+/Zen2 have family==23
105 // Model numbers for Zen:
106 // 1) Naples, Whitehaven, Summit ridge, and Snowy Owl
108 // Model numbers for Zen+:
111 return (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Amd && cpuInfo.family() == 23
112 && (cpuInfo.model() == 1 || cpuInfo.model() == 17 || cpuInfo.model() == 8
113 || cpuInfo.model() == 24));
119 SimdType simdSuggested(const CpuInfo& c)
121 SimdType suggested = SimdType::None;
123 if (c.supportLevel() >= CpuInfo::SupportLevel::Features)
127 case CpuInfo::Vendor::Intel:
128 if (c.feature(CpuInfo::Feature::X86_Avx512ER))
130 suggested = SimdType::X86_Avx512Knl;
132 else if (c.feature(CpuInfo::Feature::X86_Avx512F))
134 // If we could not identify the number of AVX512 FMA units we assume 2
135 suggested = (identifyAvx512FmaUnits() == 1) ? SimdType::X86_Avx2 : SimdType::X86_Avx512;
137 else if (c.feature(CpuInfo::Feature::X86_Avx2))
139 suggested = SimdType::X86_Avx2;
141 else if (c.feature(CpuInfo::Feature::X86_Avx))
143 suggested = SimdType::X86_Avx;
145 else if (c.feature(CpuInfo::Feature::X86_Sse4_1))
147 suggested = SimdType::X86_Sse4_1;
149 else if (c.feature(CpuInfo::Feature::X86_Sse2))
151 suggested = SimdType::X86_Sse2;
154 case CpuInfo::Vendor::Amd:
155 case CpuInfo::Vendor::Hygon:
156 if (c.feature(CpuInfo::Feature::X86_Avx2))
158 // AMD Zen supports 256-bit AVX2, but Zen1 performs better with 128-bit
159 // since it can execute two independent such instructions per cycle,
160 // and wider SIMD has slightly lower efficiency in GROMACS.
161 // However... Zen2 supports full-width execution of 256-bit AVX2,
162 // so we only want to apply this hack to Zen/Zen+.
163 suggested = cpuIsAmdZen1(c) ? SimdType::X86_Avx2_128 : SimdType::X86_Avx2;
165 else if (c.feature(CpuInfo::Feature::X86_Avx))
167 // Use 128-bit FMA SIMD if Fma4 flag is set, otherwise plain 256-bit AVX
168 if (c.feature(CpuInfo::Feature::X86_Fma4))
170 suggested = SimdType::X86_Avx128Fma;
174 suggested = SimdType::X86_Avx;
177 else if (c.feature(CpuInfo::Feature::X86_Sse4_1))
179 suggested = SimdType::X86_Sse4_1;
181 else if (c.feature(CpuInfo::Feature::X86_Sse2))
183 suggested = SimdType::X86_Sse2;
187 case CpuInfo::Vendor::Arm:
188 if (c.feature(CpuInfo::Feature::Arm_Sve))
190 suggested = SimdType::Arm_Sve;
192 else if (c.feature(CpuInfo::Feature::Arm_NeonAsimd))
194 suggested = SimdType::Arm_NeonAsimd;
196 else if (c.feature(CpuInfo::Feature::Arm_Neon))
198 suggested = SimdType::Arm_Neon;
201 case CpuInfo::Vendor::Ibm:
202 if (c.feature(CpuInfo::Feature::Ibm_Vsx))
204 suggested = SimdType::Ibm_Vsx;
206 else if (c.feature(CpuInfo::Feature::Ibm_Vmx))
208 suggested = SimdType::Ibm_Vmx;
211 case CpuInfo::Vendor::Fujitsu:
212 if (c.feature(CpuInfo::Feature::Fujitsu_HpcAce))
214 suggested = SimdType::Fujitsu_HpcAce;
223 SimdType simdCompiled()
225 #if GMX_SIMD_X86_AVX_512_KNL
226 return SimdType::X86_Avx512Knl;
227 #elif GMX_SIMD_X86_AVX_512
228 return SimdType::X86_Avx512;
229 #elif GMX_SIMD_X86_MIC
230 return SimdType::X86_Mic;
231 #elif GMX_SIMD_X86_AVX2_256
232 return SimdType::X86_Avx2;
233 #elif GMX_SIMD_X86_AVX2_128
234 return SimdType::X86_Avx2_128;
235 #elif GMX_SIMD_X86_AVX_256
236 return SimdType::X86_Avx;
237 #elif GMX_SIMD_X86_AVX_128_FMA
238 return SimdType::X86_Avx128Fma;
239 #elif GMX_SIMD_X86_SSE4_1
240 return SimdType::X86_Sse4_1;
241 #elif GMX_SIMD_X86_SSE2
242 return SimdType::X86_Sse2;
243 #elif GMX_SIMD_ARM_NEON
244 return SimdType::Arm_Neon;
245 #elif GMX_SIMD_ARM_NEON_ASIMD
246 return SimdType::Arm_NeonAsimd;
247 #elif GMX_SIMD_ARM_SVE
248 return SimdType::Arm_Sve;
249 #elif GMX_SIMD_IBM_VMX
250 return SimdType::Ibm_Vmx;
251 #elif GMX_SIMD_IBM_VSX
252 return SimdType::Ibm_Vsx;
253 #elif GMX_SIMD_SPARC64_HPC_ACE
254 return SimdType::Fujitsu_HpcAce;
255 #elif GMX_SIMD_REFERENCE
256 return SimdType::Reference;
258 return SimdType::None;
262 bool simdCheck(gmx::SimdType wanted, FILE* log, bool warnToStdErr)
264 SimdType compiled = simdCompiled();
266 gmx::TextLineWrapper wrapper;
270 wrapper.settings().setLineLength(78);
272 if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx512)
274 logMsg = wrapper.wrapToString(formatString(
275 "Highest SIMD level requested by all nodes in run: %s\n"
276 "SIMD instructions selected at compile time: %s\n"
277 "This program was compiled for different hardware than you are running on, "
278 "which could influence performance. This build might have been configured on "
279 "a login node with only a single AVX-512 FMA unit (in which case AVX2 is faster), "
280 "while the node you are running on has dual AVX-512 FMA units.",
281 simdString(wanted).c_str(), simdString(compiled).c_str()));
282 warnMsg = wrapper.wrapToString(formatString(
283 "Compiled SIMD: %s, but for this host/run %s might be better (see log).",
284 simdString(compiled).c_str(), simdString(wanted).c_str()));
286 else if (compiled == SimdType::X86_Avx512 && wanted == SimdType::X86_Avx2
287 && identifyAvx512FmaUnits() == 1)
289 // The reason for explicitly checking the number of FMA units above is to avoid triggering
290 // this conditional if the AVX2 SIMD was requested by some other node in a heterogeneous MPI run.
291 logMsg = wrapper.wrapToString(formatString(
292 "Highest SIMD level requested by all nodes in run: %s\n"
293 "SIMD instructions selected at compile time: %s\n"
294 "This program was compiled for different hardware than you are running on, "
295 "which could influence performance."
296 "This host supports AVX-512, but since it only has 1 AVX-512"
297 "FMA unit, it would be faster to use AVX2 instead.",
298 simdString(wanted).c_str(), simdString(compiled).c_str()));
299 warnMsg = wrapper.wrapToString(formatString(
300 "Compiled SIMD: %s, but for this host/run %s might be better (see log).",
301 simdString(compiled).c_str(), simdString(wanted).c_str()));
303 else if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx2_128)
305 // Wanted SimdType::X86_Avx2_128 can only be the AMD Zen architecture.
306 // AVX2_256 is only up to a few percent slower than AVX2_128
307 // in both single and double precision. AVX2_256 is slightly
308 // faster with nonbondeds and PME on a GPU. Don't warn the user.
310 else if (compiled > wanted && !(compiled == SimdType::X86_Avx && wanted == SimdType::X86_Avx128Fma))
312 // Normally it is close to catastrophic if the compiled SIMD type is larger than
313 // the supported one, but AVX128Fma is an exception: AMD CPUs will (strongly) prefer
314 // AVX128Fma, but they will work fine with AVX too. Thus, make an exception for this.
315 logMsg = wrapper.wrapToString(
316 formatString("Highest SIMD level requested by all nodes in run: %s\n"
317 "SIMD instructions selected at compile time: %s\n"
318 "Compiled SIMD newer than requested; program might crash.",
319 simdString(wanted).c_str(), simdString(compiled).c_str()));
322 else if (wanted != compiled)
324 // This warning will also occur if compiled is X86_Avx and wanted is X86_Avx128Fma
325 logMsg = wrapper.wrapToString(formatString(
326 "Highest SIMD level requested by all nodes in run: %s\n"
327 "SIMD instructions selected at compile time: %s\n"
328 "This program was compiled for different hardware than you are running on, "
329 "which could influence performance.",
330 simdString(wanted).c_str(), simdString(compiled).c_str()));
331 warnMsg = wrapper.wrapToString(formatString(
332 "Compiled SIMD: %s, but for this host/run %s might be better (see log).",
333 simdString(compiled).c_str(), simdString(wanted).c_str()));
336 else if ((compiled == SimdType::Arm_Sve) && (svcntb() != GMX_SIMD_ARM_SVE_LENGTH_VALUE / 8))
338 logMsg = wrapper.wrapToString(formatString(
339 "Longest SVE length requested by all nodes in run: %d\n"
340 "SVE length selected at compile time: %ld\n"
341 "This program was compiled for different hardware than you are running on, "
342 "which will lead to incorrect behavior.\n"
344 GMX_SIMD_ARM_SVE_LENGTH_VALUE, svcntb() * 8));
345 warnMsg = wrapper.wrapToString(formatString(
346 "Compiled SVE Length: %d, but for this process requires %ld (see log).",
347 GMX_SIMD_ARM_SVE_LENGTH_VALUE, svcntb() * 8));
351 if (!logMsg.empty() && log != nullptr)
353 fprintf(log, "%s\n", logMsg.c_str());
355 if (!warnMsg.empty() && warnToStdErr)
357 fprintf(stderr, "%s\n", warnMsg.c_str());
360 if ((compiled == SimdType::Arm_Sve) && (svcntb() != GMX_SIMD_ARM_SVE_LENGTH_VALUE / 8))
362 gmx_exit_on_fatal_error(ExitType_Abort, 1);
366 return (wanted == compiled);