Add cross-correlation as density similarity measure
[alexxy/gromacs.git] / src / gromacs / simd / support.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36 /*! \internal \file
37  *
38  * \brief Implements SIMD architecture support query routines
39  *
40  * \author Erik Lindahl <erik.lindahl@scilifelab.se>
41  *
42  * \ingroup module_simd
43  */
44
45 #include "gmxpre.h"
46
47 #include "support.h"
48
49 #include "config.h"
50
51 #include <cstdio>
52 #include <cstdlib>
53
54 #include <map>
55 #include <string>
56
57 #include "gromacs/hardware/cpuinfo.h"
58 #include "gromacs/hardware/identifyavx512fmaunits.h"
59 #include "gromacs/utility/stringutil.h"
60
61 namespace gmx
62 {
63
64 /*! \cond libapi */
65
66 const std::string &
67 simdString(SimdType s)
68 {
69     static const std::map<SimdType, std::string> name =
70     {
71         { SimdType::None,           "None"            },
72         { SimdType::Reference,      "Reference"       },
73         { SimdType::Generic,        "Generic"         },
74         { SimdType::X86_Sse2,       "SSE2"            },
75         { SimdType::X86_Sse4_1,     "SSE4.1"          },
76         { SimdType::X86_Avx128Fma,  "AVX_128_FMA"     },
77         { SimdType::X86_Avx,        "AVX_256"         },
78         { SimdType::X86_Avx2,       "AVX2_256"        },
79         { SimdType::X86_Avx2_128,   "AVX2_128"        },
80         { SimdType::X86_Avx512,     "AVX_512"         },
81         { SimdType::X86_Avx512Knl,  "AVX_512_KNL"     },
82         { SimdType::X86_Mic,        "X86_MIC"         },
83         { SimdType::Arm_Neon,       "ARM_NEON"        },
84         { SimdType::Arm_NeonAsimd,  "ARM_NEON_ASIMD"  },
85         { SimdType::Ibm_Vmx,        "IBM_VMX"         },
86         { SimdType::Ibm_Vsx,        "IBM_VSX"         },
87         { SimdType::Fujitsu_HpcAce, "Fujitsu HPC-ACE" }
88     };
89
90     return name.at(s);
91 }
92
93 SimdType
94 simdSuggested(const CpuInfo &c)
95 {
96     SimdType suggested = SimdType::None;
97
98     if (c.supportLevel() >= CpuInfo::SupportLevel::Features)
99     {
100         switch (c.vendor())
101         {
102             case CpuInfo::Vendor::Intel:
103                 if (c.feature(CpuInfo::Feature::X86_Avx512ER))
104                 {
105                     suggested = SimdType::X86_Avx512Knl;
106                 }
107                 else if (c.feature(CpuInfo::Feature::X86_Avx512F))
108                 {
109                     // If we could not identify the number of AVX512 FMA units we assume 2
110                     suggested = ( identifyAvx512FmaUnits() == 1 ) ? SimdType::X86_Avx2 : SimdType::X86_Avx512;
111                 }
112                 else if (c.feature(CpuInfo::Feature::X86_Avx2))
113                 {
114                     suggested = SimdType::X86_Avx2;
115                 }
116                 else if (c.feature(CpuInfo::Feature::X86_Avx))
117                 {
118                     suggested = SimdType::X86_Avx;
119                 }
120                 else if (c.feature(CpuInfo::Feature::X86_Sse4_1))
121                 {
122                     suggested = SimdType::X86_Sse4_1;
123                 }
124                 else if (c.feature(CpuInfo::Feature::X86_Sse2))
125                 {
126                     suggested = SimdType::X86_Sse2;
127                 }
128                 break;
129             case CpuInfo::Vendor::Amd:
130             case CpuInfo::Vendor::Hygon:
131                 if (c.feature(CpuInfo::Feature::X86_Avx2))
132                 {
133                     // AMD Ryzen supports 256-bit AVX2, but performs better with 128-bit
134                     // since it can execute two independent such instructions per cycle,
135                     // and wider SIMD has slightly lower efficiency in GROMACS.
136                     suggested = SimdType::X86_Avx2_128;
137                 }
138                 else if (c.feature(CpuInfo::Feature::X86_Avx))
139                 {
140                     // Use 128-bit FMA SIMD if Fma4 flag is set, otherwise plain 256-bit AVX
141                     if (c.feature(CpuInfo::Feature::X86_Fma4))
142                     {
143                         suggested = SimdType::X86_Avx128Fma;
144                     }
145                     else
146                     {
147                         suggested = SimdType::X86_Avx;
148                     }
149                 }
150                 else if (c.feature(CpuInfo::Feature::X86_Sse4_1))
151                 {
152                     suggested = SimdType::X86_Sse4_1;
153                 }
154                 else if (c.feature(CpuInfo::Feature::X86_Sse2))
155                 {
156                     suggested = SimdType::X86_Sse2;
157                 }
158
159                 break;
160             case CpuInfo::Vendor::Arm:
161                 if (c.feature(CpuInfo::Feature::Arm_NeonAsimd))
162                 {
163                     suggested = SimdType::Arm_NeonAsimd;
164                 }
165                 else if (c.feature(CpuInfo::Feature::Arm_Neon))
166                 {
167                     suggested = SimdType::Arm_Neon;
168                 }
169                 break;
170             case CpuInfo::Vendor::Ibm:
171                 if (c.feature(CpuInfo::Feature::Ibm_Vsx))
172                 {
173                     suggested = SimdType::Ibm_Vsx;
174                 }
175                 else if (c.feature(CpuInfo::Feature::Ibm_Vmx))
176                 {
177                     suggested = SimdType::Ibm_Vmx;
178                 }
179                 break;
180             case CpuInfo::Vendor::Fujitsu:
181                 if (c.feature(CpuInfo::Feature::Fujitsu_HpcAce))
182                 {
183                     suggested = SimdType::Fujitsu_HpcAce;
184                 }
185                 break;
186             default:
187                 break;
188         }
189     }
190     return suggested;
191 }
192
193 SimdType
194 simdCompiled()
195 {
196 #if GMX_SIMD_X86_AVX_512_KNL
197     return SimdType::X86_Avx512Knl;
198 #elif GMX_SIMD_X86_AVX_512
199     return SimdType::X86_Avx512;
200 #elif GMX_SIMD_X86_MIC
201     return SimdType::X86_Mic;
202 #elif GMX_SIMD_X86_AVX2_256
203     return SimdType::X86_Avx2;
204 #elif GMX_SIMD_X86_AVX2_128
205     return SimdType::X86_Avx2_128;
206 #elif GMX_SIMD_X86_AVX_256
207     return SimdType::X86_Avx;
208 #elif GMX_SIMD_X86_AVX_128_FMA
209     return SimdType::X86_Avx128Fma;
210 #elif GMX_SIMD_X86_SSE4_1
211     return SimdType::X86_Sse4_1;
212 #elif GMX_SIMD_X86_SSE2
213     return SimdType::X86_Sse2;
214 #elif GMX_SIMD_ARM_NEON
215     return SimdType::Arm_Neon;
216 #elif GMX_SIMD_ARM_NEON_ASIMD
217     return SimdType::Arm_NeonAsimd;
218 #elif GMX_SIMD_IBM_VMX
219     return SimdType::Ibm_Vmx;
220 #elif GMX_SIMD_IBM_VSX
221     return SimdType::Ibm_Vsx;
222 #elif GMX_SIMD_SPARC64_HPC_ACE
223     return SimdType::Fujitsu_HpcAce;
224 #elif GMX_SIMD_REFERENCE
225     return SimdType::Reference;
226 #else
227     return SimdType::None;
228 #endif
229 }
230
231 bool
232 simdCheck(gmx::SimdType    wanted,
233           FILE *           log,
234           bool             warnToStdErr)
235 {
236     SimdType             compiled  = simdCompiled();
237
238     gmx::TextLineWrapper wrapper;
239     std::string          logMsg;
240     std::string          warnMsg;
241
242     wrapper.settings().setLineLength(78);
243
244     if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx512)
245     {
246         logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
247                                                    "SIMD instructions selected at compile time:       %s\n"
248                                                    "This program was compiled for different hardware than you are running on, "
249                                                    "which could influence performance. This build might have been configured on "
250                                                    "a login node with only a single AVX-512 FMA unit (in which case AVX2 is faster), "
251                                                    "while the node you are running on has dual AVX-512 FMA units.",
252                                                    simdString(wanted).c_str(), simdString(compiled).c_str()));
253         warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
254                                                     simdString(compiled).c_str(), simdString(wanted).c_str()));
255     }
256     else if (compiled == SimdType::X86_Avx512 && wanted == SimdType::X86_Avx2 && identifyAvx512FmaUnits() == 1)
257     {
258         // The reason for explicitly checking the number of FMA units above is to avoid triggering
259         // this conditional if the AVX2 SIMD was requested by some other node in a heterogeneous MPI run.
260         logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
261                                                    "SIMD instructions selected at compile time:       %s\n"
262                                                    "This program was compiled for different hardware than you are running on, "
263                                                    "which could influence performance."
264                                                    "This host supports AVX-512, but since it only has 1 AVX-512"
265                                                    "FMA unit, it would be faster to use AVX2 instead.",
266                                                    simdString(wanted).c_str(), simdString(compiled).c_str()));
267         warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
268                                                     simdString(compiled).c_str(), simdString(wanted).c_str()));
269     }
270     else if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx2_128)
271     {
272         // Wanted SimdType::X86_Avx2_128 can only be the AMD Zen architecture.
273         // AVX2_256 is only up to a few percent slower than AVX2_128
274         // in both single and double precision. AVX2_256 is slightly
275         // faster with nonbondeds and PME on a GPU. Don't warn the user.
276     }
277     else if (compiled > wanted && !(compiled == SimdType::X86_Avx && wanted == SimdType::X86_Avx128Fma))
278     {
279         // Normally it is close to catastrophic if the compiled SIMD type is larger than
280         // the supported one, but AVX128Fma is an exception: AMD CPUs will (strongly) prefer
281         // AVX128Fma, but they will work fine with AVX too. Thus, make an exception for this.
282         logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
283                                                    "SIMD instructions selected at compile time:       %s\n"
284                                                    "Compiled SIMD newer than requested; program might crash.",
285                                                    simdString(wanted).c_str(), simdString(compiled).c_str()));
286         warnMsg = logMsg;
287     }
288     else if (wanted != compiled)
289     {
290         // This warning will also occur if compiled is X86_Avx and wanted is X86_Avx128Fma
291         logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
292                                                    "SIMD instructions selected at compile time:       %s\n"
293                                                    "This program was compiled for different hardware than you are running on, "
294                                                    "which could influence performance.",
295                                                    simdString(wanted).c_str(), simdString(compiled).c_str()));
296         warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
297                                                     simdString(compiled).c_str(), simdString(wanted).c_str()));
298     }
299
300     if (!logMsg.empty() && log != nullptr)
301     {
302         fprintf(log, "%s\n", logMsg.c_str());
303     }
304     if (!warnMsg.empty() && warnToStdErr)
305     {
306         fprintf(stderr, "%s\n", warnMsg.c_str());
307     }
308
309     return (wanted == compiled);
310 }
311
312 /*! \endcond */
313
314 }  // namespace gmx