Merge branch release-2019 into master
[alexxy/gromacs.git] / src / gromacs / simd / support.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36 /*! \internal \file
37  *
38  * \brief Implements SIMD architecture support query routines
39  *
40  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
41  *
42  * \ingroup module_simd
43  */
44
45 #include "gmxpre.h"
46
47 #include "support.h"
48
49 #include "config.h"
50
51 #include <cstdio>
52 #include <cstdlib>
53
54 #include <map>
55 #include <string>
56
57 #include "gromacs/hardware/cpuinfo.h"
58 #include "gromacs/hardware/identifyavx512fmaunits.h"
59 #include "gromacs/utility/stringutil.h"
60
61 namespace gmx
62 {
63
64 /*! \cond libapi */
65
66 const std::string &
67 simdString(SimdType s)
68 {
69     static const std::map<SimdType, std::string> name =
70     {
71         { SimdType::None,           "None"            },
72         { SimdType::Reference,      "Reference"       },
73         { SimdType::Generic,        "Generic"         },
74         { SimdType::X86_Sse2,       "SSE2"            },
75         { SimdType::X86_Sse4_1,     "SSE4.1"          },
76         { SimdType::X86_Avx128Fma,  "AVX_128_FMA"     },
77         { SimdType::X86_Avx,        "AVX_256"         },
78         { SimdType::X86_Avx2,       "AVX2_256"        },
79         { SimdType::X86_Avx2_128,   "AVX2_128"        },
80         { SimdType::X86_Avx512,     "AVX_512"         },
81         { SimdType::X86_Avx512Knl,  "AVX_512_KNL"     },
82         { SimdType::X86_Mic,        "X86_MIC"         },
83         { SimdType::Arm_Neon,       "ARM_NEON"        },
84         { SimdType::Arm_NeonAsimd,  "ARM_NEON_ASIMD"  },
85         { SimdType::Ibm_Vmx,        "IBM_VMX"         },
86         { SimdType::Ibm_Vsx,        "IBM_VSX"         },
87         { SimdType::Fujitsu_HpcAce, "Fujitsu HPC-ACE" }
88     };
89
90     return name.at(s);
91 }
92
93 namespace
94 {
95
96
97 //! Helper to detect correct AMD Zen architecture.
98 bool
99 cpuIsAmdZen1(const CpuInfo &cpuInfo)
100 {
101     // Both Zen/Zen+/Zen2 have family==23
102     // Model numbers for Zen:
103     // 1)  Naples, Whitehaven, Summit ridge, and Snowy Owl
104     // 17) Raven ridge
105     // Model numbers for Zen+:
106     // 8)  Pinnacle Ridge
107     // 24) Picasso
108     return (cpuInfo.vendor() == gmx::CpuInfo::Vendor::Amd &&
109             cpuInfo.family() == 23 &&
110             (cpuInfo.model() == 1 || cpuInfo.model() == 17 ||
111              cpuInfo.model() == 8 || cpuInfo.model() == 24) );
112 }
113
114 }   // namespace
115
116
117 SimdType
118 simdSuggested(const CpuInfo &c)
119 {
120     SimdType suggested = SimdType::None;
121
122     if (c.supportLevel() >= CpuInfo::SupportLevel::Features)
123     {
124         switch (c.vendor())
125         {
126             case CpuInfo::Vendor::Intel:
127                 if (c.feature(CpuInfo::Feature::X86_Avx512ER))
128                 {
129                     suggested = SimdType::X86_Avx512Knl;
130                 }
131                 else if (c.feature(CpuInfo::Feature::X86_Avx512F))
132                 {
133                     // If we could not identify the number of AVX512 FMA units we assume 2
134                     suggested = ( identifyAvx512FmaUnits() == 1 ) ? SimdType::X86_Avx2 : SimdType::X86_Avx512;
135                 }
136                 else if (c.feature(CpuInfo::Feature::X86_Avx2))
137                 {
138                     suggested = SimdType::X86_Avx2;
139                 }
140                 else if (c.feature(CpuInfo::Feature::X86_Avx))
141                 {
142                     suggested = SimdType::X86_Avx;
143                 }
144                 else if (c.feature(CpuInfo::Feature::X86_Sse4_1))
145                 {
146                     suggested = SimdType::X86_Sse4_1;
147                 }
148                 else if (c.feature(CpuInfo::Feature::X86_Sse2))
149                 {
150                     suggested = SimdType::X86_Sse2;
151                 }
152                 break;
153             case CpuInfo::Vendor::Amd:
154             case CpuInfo::Vendor::Hygon:
155                 if (c.feature(CpuInfo::Feature::X86_Avx2))
156                 {
157                     // AMD Zen supports 256-bit AVX2, but Zen1 performs better with 128-bit
158                     // since it can execute two independent such instructions per cycle,
159                     // and wider SIMD has slightly lower efficiency in GROMACS.
160                     // However... Zen2 supports full-width execution of 256-bit AVX2,
161                     // so we only want to apply this hack to Zen/Zen+.
162                     suggested = cpuIsAmdZen1(c) ? SimdType::X86_Avx2_128 : SimdType::X86_Avx2;
163                 }
164                 else if (c.feature(CpuInfo::Feature::X86_Avx))
165                 {
166                     // Use 128-bit FMA SIMD if Fma4 flag is set, otherwise plain 256-bit AVX
167                     if (c.feature(CpuInfo::Feature::X86_Fma4))
168                     {
169                         suggested = SimdType::X86_Avx128Fma;
170                     }
171                     else
172                     {
173                         suggested = SimdType::X86_Avx;
174                     }
175                 }
176                 else if (c.feature(CpuInfo::Feature::X86_Sse4_1))
177                 {
178                     suggested = SimdType::X86_Sse4_1;
179                 }
180                 else if (c.feature(CpuInfo::Feature::X86_Sse2))
181                 {
182                     suggested = SimdType::X86_Sse2;
183                 }
184
185                 break;
186             case CpuInfo::Vendor::Arm:
187                 if (c.feature(CpuInfo::Feature::Arm_NeonAsimd))
188                 {
189                     suggested = SimdType::Arm_NeonAsimd;
190                 }
191                 else if (c.feature(CpuInfo::Feature::Arm_Neon))
192                 {
193                     suggested = SimdType::Arm_Neon;
194                 }
195                 break;
196             case CpuInfo::Vendor::Ibm:
197                 if (c.feature(CpuInfo::Feature::Ibm_Vsx))
198                 {
199                     suggested = SimdType::Ibm_Vsx;
200                 }
201                 else if (c.feature(CpuInfo::Feature::Ibm_Vmx))
202                 {
203                     suggested = SimdType::Ibm_Vmx;
204                 }
205                 break;
206             case CpuInfo::Vendor::Fujitsu:
207                 if (c.feature(CpuInfo::Feature::Fujitsu_HpcAce))
208                 {
209                     suggested = SimdType::Fujitsu_HpcAce;
210                 }
211                 break;
212             default:
213                 break;
214         }
215     }
216     return suggested;
217 }
218
219 SimdType
220 simdCompiled()
221 {
222 #if GMX_SIMD_X86_AVX_512_KNL
223     return SimdType::X86_Avx512Knl;
224 #elif GMX_SIMD_X86_AVX_512
225     return SimdType::X86_Avx512;
226 #elif GMX_SIMD_X86_MIC
227     return SimdType::X86_Mic;
228 #elif GMX_SIMD_X86_AVX2_256
229     return SimdType::X86_Avx2;
230 #elif GMX_SIMD_X86_AVX2_128
231     return SimdType::X86_Avx2_128;
232 #elif GMX_SIMD_X86_AVX_256
233     return SimdType::X86_Avx;
234 #elif GMX_SIMD_X86_AVX_128_FMA
235     return SimdType::X86_Avx128Fma;
236 #elif GMX_SIMD_X86_SSE4_1
237     return SimdType::X86_Sse4_1;
238 #elif GMX_SIMD_X86_SSE2
239     return SimdType::X86_Sse2;
240 #elif GMX_SIMD_ARM_NEON
241     return SimdType::Arm_Neon;
242 #elif GMX_SIMD_ARM_NEON_ASIMD
243     return SimdType::Arm_NeonAsimd;
244 #elif GMX_SIMD_IBM_VMX
245     return SimdType::Ibm_Vmx;
246 #elif GMX_SIMD_IBM_VSX
247     return SimdType::Ibm_Vsx;
248 #elif GMX_SIMD_SPARC64_HPC_ACE
249     return SimdType::Fujitsu_HpcAce;
250 #elif GMX_SIMD_REFERENCE
251     return SimdType::Reference;
252 #else
253     return SimdType::None;
254 #endif
255 }
256
257 bool
258 simdCheck(gmx::SimdType    wanted,
259           FILE *           log,
260           bool             warnToStdErr)
261 {
262     SimdType             compiled  = simdCompiled();
263
264     gmx::TextLineWrapper wrapper;
265     std::string          logMsg;
266     std::string          warnMsg;
267
268     wrapper.settings().setLineLength(78);
269
270     if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx512)
271     {
272         logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
273                                                    "SIMD instructions selected at compile time:       %s\n"
274                                                    "This program was compiled for different hardware than you are running on, "
275                                                    "which could influence performance. This build might have been configured on "
276                                                    "a login node with only a single AVX-512 FMA unit (in which case AVX2 is faster), "
277                                                    "while the node you are running on has dual AVX-512 FMA units.",
278                                                    simdString(wanted).c_str(), simdString(compiled).c_str()));
279         warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
280                                                     simdString(compiled).c_str(), simdString(wanted).c_str()));
281     }
282     else if (compiled == SimdType::X86_Avx512 && wanted == SimdType::X86_Avx2 && identifyAvx512FmaUnits() == 1)
283     {
284         // The reason for explicitly checking the number of FMA units above is to avoid triggering
285         // this conditional if the AVX2 SIMD was requested by some other node in a heterogeneous MPI run.
286         logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
287                                                    "SIMD instructions selected at compile time:       %s\n"
288                                                    "This program was compiled for different hardware than you are running on, "
289                                                    "which could influence performance."
290                                                    "This host supports AVX-512, but since it only has 1 AVX-512"
291                                                    "FMA unit, it would be faster to use AVX2 instead.",
292                                                    simdString(wanted).c_str(), simdString(compiled).c_str()));
293         warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
294                                                     simdString(compiled).c_str(), simdString(wanted).c_str()));
295     }
296     else if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx2_128)
297     {
298         // Wanted SimdType::X86_Avx2_128 can only be the AMD Zen architecture.
299         // AVX2_256 is only up to a few percent slower than AVX2_128
300         // in both single and double precision. AVX2_256 is slightly
301         // faster with nonbondeds and PME on a GPU. Don't warn the user.
302     }
303     else if (compiled > wanted && !(compiled == SimdType::X86_Avx && wanted == SimdType::X86_Avx128Fma))
304     {
305         // Normally it is close to catastrophic if the compiled SIMD type is larger than
306         // the supported one, but AVX128Fma is an exception: AMD CPUs will (strongly) prefer
307         // AVX128Fma, but they will work fine with AVX too. Thus, make an exception for this.
308         logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
309                                                    "SIMD instructions selected at compile time:       %s\n"
310                                                    "Compiled SIMD newer than requested; program might crash.",
311                                                    simdString(wanted).c_str(), simdString(compiled).c_str()));
312         warnMsg = logMsg;
313     }
314     else if (wanted != compiled)
315     {
316         // This warning will also occur if compiled is X86_Avx and wanted is X86_Avx128Fma
317         logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
318                                                    "SIMD instructions selected at compile time:       %s\n"
319                                                    "This program was compiled for different hardware than you are running on, "
320                                                    "which could influence performance.",
321                                                    simdString(wanted).c_str(), simdString(compiled).c_str()));
322         warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
323                                                     simdString(compiled).c_str(), simdString(wanted).c_str()));
324     }
325
326     if (!logMsg.empty() && log != nullptr)
327     {
328         fprintf(log, "%s\n", logMsg.c_str());
329     }
330     if (!warnMsg.empty() && warnToStdErr)
331     {
332         fprintf(stderr, "%s\n", warnMsg.c_str());
333     }
334
335     return (wanted == compiled);
336 }
337
338 /*! \endcond */
339
340 }  // namespace gmx