--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2017, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+include(gmxSimdFlags)
+
+# gmx_detect_avx_512_fma_units()
+#
+# Try to detect whether the host has one or two AVX-512 FMA units
+# by executing a small program. This will only work on hosts that
+# support AVX-512. If successful it sets RESULT to 1 or 2 for the
+# number of AVX-512 FMA units, and otherwise -1.
+#
+function(gmx_detect_avx_512_fma_units RESULT)
+ if(CMAKE_CROSSCOMPILING)
+ set(${RESULT} -1 CACHE INTERNAL "Result of test for number of AVX-512 FMA units")
+ else()
+ set(AVX_512_FMA_UNIT_DETECTION_BINARY "${PROJECT_BINARY_DIR}/CMakeFiles/GmxDetectAvx512FmaUnits${CMAKE_EXECUTABLE_SUFFIX}")
+ if(NOT AVX_512_FMA_UNIT_DETECTION_COMPILED)
+
+ # Find flags required for AVX-512
+ gmx_find_simd_avx_512_flags(SIMD_AVX_512_C_SUPPORTED SIMD_AVX_512_CXX_SUPPORTED
+ SIMD_AVX_512_C_FLAGS SIMD_AVX_512_CXX_FLAGS)
+
+ if(${SIMD_AVX_512_CXX_SUPPORTED})
+ # Compile the detection program
+
+ set(_compile_definitions "-I${PROJECT_SOURCE_DIR}/src -DGMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE ${SIMD_AVX_512_CXX_FLAGS} ${GMX_STDLIB_CXX_FLAGS}")
+ try_compile(AVX_512_FMA_UNIT_DETECTION_COMPILED
+ "${PROJECT_BINARY_DIR}"
+ "${PROJECT_SOURCE_DIR}/src/gromacs/hardware/identifyavx512fmaunits.cpp"
+ COMPILE_DEFINITIONS "${_compile_definitions}"
+ LINK_LIBRARIES "${GMX_STDLIB_LIBRARIES}"
+ OUTPUT_VARIABLE AVX_512_FMA_UNIT_DETECTION_COMPILED_OUTPUT
+ COPY_FILE ${AVX_512_FMA_UNIT_DETECTION_BINARY})
+ if(NOT AVX_512_FMA_UNIT_DETECTION_COMPILED AND NOT RUN_AVX_512_FMA_UNIT_DETECTION_COMPILATION_QUIETLY)
+ message(STATUS "Could not identify number of AVX-512 units - detection program did not compile")
+ endif()
+ set(RUN_AVX_512_FMA_UNIT_DETECTION_COMPILATION_QUIETLY TRUE CACHE INTERNAL "Keep quiet on any future compilation attempts")
+ endif()
+
+ if(AVX_512_FMA_UNIT_DETECTION_COMPILED)
+ # Run the program
+ if(NOT DEFINED ${RESULT})
+ execute_process(COMMAND ${AVX_512_FMA_UNIT_DETECTION_BINARY}
+ RESULT_VARIABLE RESULT_VAR
+ OUTPUT_VARIABLE OUTPUT_VAR_TEMP
+ ERROR_QUIET)
+ if (RESULT_VAR EQUAL 0)
+ string(STRIP "${OUTPUT_VAR_TEMP}" OUTPUT_VAR)
+ set(${RESULT} ${OUTPUT_VAR_TEMP} CACHE INTERNAL "Result of test for number of AVX-512 FMA units")
+ else()
+ message(STATUS "Could not identify number of AVX-512 units - detection program did run successfully")
+ set(${RESULT} -1 CACHE INTERNAL "Result of test for number of AVX-512 FMA units")
+ endif()
+ endif()
+ endif()
+ endif()
+ endif()
+endfunction()
gmx_detect_target_architecture()
include(gmxDetectCpu)
+include(gmxDetectAvx512FmaUnits)
+
function(gmx_suggest_simd _suggested_simd)
if (NOT SUGGEST_SIMD_QUIETLY)
message(STATUS "Detecting best SIMD instructions for this CPU")
if(CPU_DETECTION_FEATURES MATCHES " avx512er ")
set(OUTPUT_SIMD "AVX_512_KNL")
elseif(CPU_DETECTION_FEATURES MATCHES " avx512f ")
- set(OUTPUT_SIMD "AVX_512")
+ gmx_detect_avx_512_fma_units(NUMBER_OF_AVX_512_FMA_UNITS)
+ if(NUMBER_OF_AVX_512_FMA_UNITS EQUAL 2)
+ set(OUTPUT_SIMD "AVX_512")
+ elseif(NUMBER_OF_AVX_512_FMA_UNITS EQUAL 1)
+ message(STATUS "This host supports AVX-512, but only has 1 AVX-512 FMA unit, so AVX2 will be faster.")
+ set(OUTPUT_SIMD "AVX2_256")
+ else()
+ message(WARNING "Could not run code to detect number of AVX-512 FMA units - assuming 2.")
+ set(OUTPUT_SIMD "AVX_512")
+ endif()
elseif(CPU_DETECTION_FEATURES MATCHES " avx2 ")
if(CPU_DETECTION_FEATURES MATCHES " amd ")
set(OUTPUT_SIMD "AVX2_128")
message(STATUS "${SIMD_STATUS_MESSAGE}")
endif()
+# While AVX-512 is a more recent SIMD ISA than AVX2, some Intel CPUs only have
+# a single AVX-512 FMA unit, but two AVX2 FMA units, and then it is better to
+# use AVX2. The only way to test this is to execute a small timing loop.
+# To be able to recommend the user whether s/he should try AVX-512 instead of
+# AVX2, we need to compile a single file with AVX512 flags. We do this
+# automatically, but this option provides a way to turn it off in case it
+# breaks something. The actual test source file is built if
+# SIMD_AVX_512_CXX_SUPPORTED is set, so it will always be included if we have
+# GMX_SIMD=AVX_512.
+set(GMX_ENABLE_AVX512_TESTS ON CACHE INTERNAL "Compile AVX512 code to test FMA units, even when not using AVX512 SIMD")
+mark_as_advanced(GMX_ENABLE_AVX512_TESTS)
+
+if(GMX_ENABLE_AVX512_TESTS AND
+ (GMX_SIMD_ACTIVE STREQUAL "AVX_256" OR GMX_SIMD_ACTIVE STREQUAL "AVX2_256" OR GMX_SIMD_ACTIVE STREQUAL "AVX2_128"))
+ gmx_find_simd_avx_512_flags(SIMD_AVX_512_C_SUPPORTED SIMD_AVX_512_CXX_SUPPORTED
+ SIMD_AVX_512_C_FLAGS SIMD_AVX_512_CXX_FLAGS)
+endif()
+
# By default, 32-bit windows cannot pass SIMD (SSE/AVX) arguments in registers,
# and even on 64-bit (all platforms) it is only used for a handful of arguments.
# The __vectorcall (MSVC, from MSVC2013) or __regcall (ICC) calling conventions
"-msse2" "/arch:SSE2" "-hgnu")
if(${SIMD_SSE2_C_FLAGS_RESULT})
- set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE2_C_FLAGS}" PARENT_SCOPE)
+ set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE2_C_FLAGS}" CACHE INTERNAL "C flags required for SSE2 instructions")
endif()
if(${SIMD_SSE2_CXX_FLAGS_RESULT})
- set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE2_CXX_FLAGS}" PARENT_SCOPE)
+ set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE2_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for SSE2 instructions")
endif()
set(${C_FLAGS_RESULT} ${SIMD_SSE2_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE2 C flags" FORCE)
- set(${CXX_FLAGS_RESULT} ${SIMD_SSE2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE2 CXX flags" FORCE)
+ set(${CXX_FLAGS_RESULT} ${SIMD_SSE2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE2 C++ flags" FORCE)
endfunction()
# SSE4.1
"-msse4.1" "/arch:SSE4.1" "/arch:SSE2" "-hgnu")
if(${SIMD_SSE4_1_C_FLAGS_RESULT})
- set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE4_1_C_FLAGS}" PARENT_SCOPE)
+ set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_SSE4_1_C_FLAGS}" CACHE INTERNAL "C flags required for SSE4.1 instructions")
endif()
if(${SIMD_SSE4_1_CXX_FLAGS_RESULT})
- set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE4_1_CXX_FLAGS}" PARENT_SCOPE)
+ set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_SSE4_1_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for SSE4.1 instructions")
endif()
set(${C_FLAGS_RESULT} ${SIMD_SSE4_1_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE4.1 C flags" FORCE)
- set(${CXX_FLAGS_RESULT} ${SIMD_SSE4_1_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE4.1 CXX flags" FORCE)
+ set(${CXX_FLAGS_RESULT} ${SIMD_SSE4_1_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for SSE4.1 C++ flags" FORCE)
endfunction()
# AVX, but using only 128-bit instructions and FMA (AMD XOP processors)
endif()
if(${SIMD_AVX_128_FMA_C_FLAGS_RESULT})
- set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_GENERIC_C_FLAGS} ${SIMD_AVX_AMD_FMA_C_FLAGS} ${SIMD_AVX_XOP_C_FLAGS}" PARENT_SCOPE)
+ set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_GENERIC_C_FLAGS} ${SIMD_AVX_AMD_FMA_C_FLAGS} ${SIMD_AVX_XOP_C_FLAGS}" CACHE INTERNAL "C flags required for 128-bit AVX with AMD FMA instructions")
endif()
if(${SIMD_AVX_128_FMA_CXX_FLAGS_RESULT})
- set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_GENERIC_CXX_FLAGS} ${SIMD_AVX_AMD_FMA_CXX_FLAGS} ${SIMD_AVX_XOP_CXX_FLAGS}" PARENT_SCOPE)
+ set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_GENERIC_CXX_FLAGS} ${SIMD_AVX_AMD_FMA_CXX_FLAGS} ${SIMD_AVX_XOP_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for 128-bit AVX with AMD FMA instructions")
endif()
set(${C_FLAGS_RESULT} ${SIMD_AVX_128_FMA_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for 128-bit AVX with AMD FMA C flags" FORCE)
- set(${CXX_FLAGS_RESULT} ${SIMD_AVX_128_FMA_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for 128-bit AVX with AMD FMA CXX flags" FORCE)
+ set(${CXX_FLAGS_RESULT} ${SIMD_AVX_128_FMA_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for 128-bit AVX with AMD FMA C++ flags" FORCE)
endfunction()
"-mavx" "/arch:AVX" "-hgnu")
if(${SIMD_AVX_C_FLAGS_RESULT})
- set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_C_FLAGS}" PARENT_SCOPE)
+ set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_C_FLAGS}" CACHE INTERNAL "C flags required for AVX instructions")
endif()
if(${SIMD_AVX_CXX_FLAGS_RESULT})
- set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_CXX_FLAGS}" PARENT_SCOPE)
+ set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX instructions")
endif()
set(${C_FLAGS_RESULT} ${SIMD_AVX_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX C flags" FORCE)
- set(${CXX_FLAGS_RESULT} ${SIMD_AVX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX CXX flags" FORCE)
+ set(${CXX_FLAGS_RESULT} ${SIMD_AVX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX C++ flags" FORCE)
endfunction()
# AVX2
"-march=core-avx2" "-mavx2" "/arch:AVX" "-hgnu") # no AVX2-specific flag for MSVC yet
if(${SIMD_AVX2_C_FLAGS_RESULT})
- set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX2_C_FLAGS}" PARENT_SCOPE)
+ set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX2_C_FLAGS}" CACHE INTERNAL "C flags required for AVX2 instructions")
endif()
if(${SIMD_AVX2_CXX_FLAGS_RESULT})
- set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX2_CXX_FLAGS}" PARENT_SCOPE)
+ set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX2_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX2 instructions")
endif()
set(${C_FLAGS_RESULT} ${SIMD_AVX2_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX2 C flags" FORCE)
- set(${CXX_FLAGS_RESULT} ${SIMD_AVX2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX2 CXX flags" FORCE)
+ set(${CXX_FLAGS_RESULT} ${SIMD_AVX2_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX2 C++ flags" FORCE)
endfunction()
"-xCORE-AVX512 -qopt-zmm-usage=high" "-xCORE-AVX512" "-mavx512f -mfma" "-mavx512f" "/arch:AVX" "-hgnu") # no AVX_512F flags known for MSVC yet. ICC should use ZMM if code anyhow uses ZMM
if(${SIMD_AVX_512_C_FLAGS_RESULT})
- set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_C_FLAGS}" PARENT_SCOPE)
+ set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_C_FLAGS}" CACHE INTERNAL "C flags required for AVX-512 instructions")
endif()
if(${SIMD_AVX_512_CXX_FLAGS_RESULT})
- set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_CXX_FLAGS}" PARENT_SCOPE)
+ set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX-512 instructions")
endif()
set(${C_FLAGS_RESULT} ${SIMD_AVX_512_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 C flags" FORCE)
- set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 CXX flags" FORCE)
+ set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 C++ flags" FORCE)
endfunction()
"-xMIC-AVX512" "-mavx512er -mfma" "-mavx512er" "/arch:AVX" "-hgnu") # no AVX_512ER flags known for MSVC yet
if(${SIMD_AVX_512_KNL_C_FLAGS_RESULT})
- set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_KNL_C_FLAGS}" PARENT_SCOPE)
+ set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_AVX_512_KNL_C_FLAGS}" CACHE INTERNAL "C flags required for AVX-512 for KNL instructions")
endif()
if(${SIMD_AVX_512_KNL_CXX_FLAGS_RESULT})
- set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_KNL_CXX_FLAGS}" PARENT_SCOPE)
+ set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_AVX_512_KNL_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for AVX-512 for KNL instructions")
endif()
set(${C_FLAGS_RESULT} ${SIMD_AVX_512_KNL_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 for KNL C flags" FORCE)
- set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_KNL_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 for KNL CXX flags" FORCE)
+ set(${CXX_FLAGS_RESULT} ${SIMD_AVX_512_KNL_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for AVX-512 for KNL C++ flags" FORCE)
endfunction()
"-mfpu=neon-vfpv4" "-mfpu=neon" "")
if(${SIMD_ARM_NEON_C_FLAGS_RESULT})
- set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_C_FLAGS}" PARENT_SCOPE)
+ set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_C_FLAGS}" CACHE INTERNAL "C flags required for Arm Neon instructions")
endif()
if(${SIMD_ARM_NEON_CXX_FLAGS_RESULT})
- set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_CXX_FLAGS}" PARENT_SCOPE)
+ set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for Arm Neon instructions")
endif()
set(${C_FLAGS_RESULT} ${SIMD_ARM_NEON_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon C flags" FORCE)
- set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon CXX flags" FORCE)
+ set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon C++ flags" FORCE)
endfunction()
# Arm Neon Asimd (64-bit ARM)
"")
if(${SIMD_ARM_NEON_ASIMD_C_FLAGS_RESULT})
- set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_ASIMD_C_FLAGS}" PARENT_SCOPE)
+ set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_ARM_NEON_ASIMD_C_FLAGS}" CACHE INTERNAL "C flags required for Arm Neon Asimd instructions")
endif()
if(${SIMD_ARM_NEON_ASIMD_CXX_FLAGS_RESULT})
- set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS}" PARENT_SCOPE)
+ set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for Arm Neon Asimd instructions")
endif()
set(${C_FLAGS_RESULT} ${SIMD_ARM_NEON_ASIMD_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon Asimd C flags" FORCE)
- set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon Asimd CXX flags" FORCE)
+ set(${CXX_FLAGS_RESULT} ${SIMD_ARM_NEON_ASIMD_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for Arm Neon Asimd C++ flags" FORCE)
endfunction()
# IBM VMX (power6)
"-maltivec -mabi=altivec" "-qarch=auto -qaltivec")
if(${SIMD_IBM_VMX_C_FLAGS_RESULT})
- set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VMX_C_FLAGS}" PARENT_SCOPE)
+ set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VMX_C_FLAGS}" CACHE INTERNAL "C flags required for IBM VMX instructions")
endif()
if(${SIMD_IBM_VMX_CXX_FLAGS_RESULT})
- set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VMX_CXX_FLAGS}" PARENT_SCOPE)
+ set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VMX_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for IBM VMX instructions")
endif()
set(${C_FLAGS_RESULT} ${SIMD_IBM_VMX_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VMX C flags" FORCE)
- set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VMX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VMX CXX flags" FORCE)
+ set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VMX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VMX C++ flags" FORCE)
endfunction()
# IBM VSX (power7 and later)
"-mvsx" "-maltivec -mabi=altivec" "-qarch=auto -qaltivec")
if(${SIMD_IBM_VSX_C_FLAGS_RESULT})
- set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VSX_C_FLAGS}" PARENT_SCOPE)
+ set(${C_FLAGS_VARIABLE} "${TOOLCHAIN_C_FLAGS} ${SIMD_IBM_VSX_C_FLAGS}" CACHE INTERNAL "C flags required for IBM VSX instructions")
endif()
if(${SIMD_IBM_VSX_CXX_FLAGS_RESULT})
- set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VSX_CXX_FLAGS}" PARENT_SCOPE)
+ set(${CXX_FLAGS_VARIABLE} "${TOOLCHAIN_CXX_FLAGS} ${SIMD_IBM_VSX_CXX_FLAGS}" CACHE INTERNAL "C++ flags required for IBM VSX instructions")
endif()
set(${C_FLAGS_RESULT} ${SIMD_IBM_VSX_C_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VSX C flags" FORCE)
- set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VSX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VSX CXX flags" FORCE)
+ set(${CXX_FLAGS_RESULT} ${SIMD_IBM_VSX_CXX_FLAGS_RESULT} CACHE INTERNAL "Result of test for IBM VSX C++ flags" FORCE)
endfunction()
/* Target mantissa accuracy for SIMD double precision math */
#define GMX_SIMD_ACCURACY_BITS_DOUBLE @GMX_SIMD_ACCURACY_BITS_DOUBLE@
+/* Enable code that requires AVX-512 instruction support, without GMX_SIMD=AVX_512 */
+#cmakedefine01 SIMD_AVX_512_CXX_SUPPORTED
+
/* Whether a double-precision configuration may target accuracy equivalent to single precision */
#cmakedefine01 GMX_RELAXED_DOUBLE_PRECISION
set_property(GLOBAL PROPERTY GMX_LIBGROMACS_SOURCES)
set_property(GLOBAL PROPERTY GMX_INSTALLED_HEADERS)
+set_property(GLOBAL PROPERTY GMX_AVX_512_SOURCE)
function (_gmx_add_files_to_property PROPERTY)
foreach (_file ${ARGN})
endif()
set_source_files_properties(selection/scanner.cpp PROPERTIES COMPILE_FLAGS "${_scanner_cpp_compiler_flags}")
+if(SIMD_AVX_512_CXX_SUPPORTED)
+ set_source_files_properties(hardware/identifyavx512fmaunits.cpp PROPERTIES COMPILE_FLAGS "${SIMD_AVX_512_CXX_FLAGS}")
+endif()
+
gmx_setup_tng_for_libgromacs()
target_link_libraries(libgromacs
gpu_hw_info.cpp
hardwaretopology.cpp
printhardware.cpp
+ identifyavx512fmaunits.cpp
)
if (BUILD_TESTING)
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements a routine to check the number of AVX512 fma units
+ *
+ * Just as the CpuInfo code, we need to be able to compile this file in stand-alone mode
+ * to set the SIMD acceleration and similar things during CMake configuration.
+ */
+
+#ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
+#include "gmxpre.h"
+#endif
+
+#include "identifyavx512fmaunits.h"
+
+#ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
+#define SIMD_AVX_512_CXX_SUPPORTED 1
+#else
+#include "config.h"
+#endif
+
+#if SIMD_AVX_512_CXX_SUPPORTED
+#include <immintrin.h>
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+#endif // SIMD_AVX_512_CXX_SUPPORTED
+
+#include <cstdint>
+#include <cstdio>
+
+#include <algorithm>
+#include <mutex>
+
+#ifndef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
+#include "gromacs/hardware/cpuinfo.h"
+#endif
+
+namespace gmx
+{
+
+namespace
+{
+
+#if SIMD_AVX_512_CXX_SUPPORTED
+// Use a local routine to read the timestep counter just on x86 to avoid dependence
+// on the Gromacs cycle counter module.
+uint64_t rdtscp(void)
+{
+#ifdef MSC_VER
+ unsigned int ui;
+ return static_cast<uint64_t>(__rdtscp(&ui));
+#else
+ uint32_t low;
+ uint32_t high;
+
+ __asm__ __volatile__("rdtscp" : "=a" (low), "=d" (high) :: "ecx" );
+ return (static_cast<uint64_t>(high) << 32) | low;
+#endif
+}
+
+/*\ brief Loop over mixed FMA and shuffle AVX512 instructions
+ *
+ * This function executes a meaningless loop that includes both
+ * FMA and shuffle instructions from the AVX512 instruction set.
+ * We need a bit of complex logic to make sure it cannot be
+ * optimized away by the compiler.
+ *
+ * \param loopCount Number of iterations. Each iteration will
+ * execute 12 FMA and 12 shuffle instructions.
+ * \param seed A double-precision number between 0 and 1.
+ * To be really certain the loop is not optimized
+ * away, you should use some timing-related
+ * function to create this seed at runtime.
+ * \return Meaningless floating-point number. Make sure you
+ * add this number to some variable and conditionally
+ * issue a print statement e.g. if it is negative
+ * (which should not happen), again to make sure the loop
+ * cannot be optimized away.
+ */
+double
+executeFmaAndShuffleLoop(int loopCount,
+ double seed)
+{
+ // Make sure all variables are different to avoid gcc optimizing them away
+ __m512d d0 = _mm512_set1_pd(1.0-0.01*seed);
+ __m512d d1 = _mm512_set1_pd(1.0-0.02*seed);
+ __m512d d2 = _mm512_set1_pd(1.0-0.03*seed);
+ __m512d d3 = _mm512_set1_pd(1.0-0.04*seed);
+ __m512d d4 = _mm512_set1_pd(1.0-0.05*seed);
+ __m512d d5 = _mm512_set1_pd(1.0-0.06*seed);
+ __m512d d6 = _mm512_set1_pd(1.0-0.07*seed);
+ __m512d d7 = _mm512_set1_pd(1.0-0.08*seed);
+ __m512d d8 = _mm512_set1_pd(1.0-0.09*seed);
+ __m512d d9 = _mm512_set1_pd(1.0-0.10*seed);
+ __m512d d10 = _mm512_set1_pd(1.0-0.11*seed);
+ __m512d d11 = _mm512_set1_pd(1.0-0.12*seed);
+ __m512d eps = _mm512_set1_pd(1e-6);
+ __m512i i0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+ __m512i i1 = _mm512_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
+ __m512i i2 = _mm512_set_epi32(1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2);
+ __m512i i3 = _mm512_set_epi32(2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3);
+ __m512i i4 = _mm512_set_epi32(3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4);
+ __m512i i5 = _mm512_set_epi32(4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5);
+ __m512i i6 = _mm512_set_epi32(5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6);
+ __m512i i7 = _mm512_set_epi32(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+ __m512i i8 = _mm512_set_epi32(8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9);
+ __m512i i9 = _mm512_set_epi32(9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10);
+ __m512i i10 = _mm512_set_epi32(10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11);
+ __m512i i11 = _mm512_set_epi32(11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12);
+ __m512i idx = _mm512_set_epi32(12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13);
+ __mmask16 mask = static_cast<uint16_t>(0xffff);
+
+ for (int i = 0; i < loopCount; i++)
+ {
+ d0 = _mm512_fmadd_pd(d0, d0, eps);
+ d1 = _mm512_fmadd_pd(d1, d1, eps);
+ d2 = _mm512_fmadd_pd(d2, d2, eps);
+ d3 = _mm512_fmadd_pd(d3, d3, eps);
+ d4 = _mm512_fmadd_pd(d4, d4, eps);
+ d5 = _mm512_fmadd_pd(d5, d5, eps);
+ d6 = _mm512_fmadd_pd(d6, d6, eps);
+ d7 = _mm512_fmadd_pd(d7, d7, eps);
+ d8 = _mm512_fmadd_pd(d8, d8, eps);
+ d9 = _mm512_fmadd_pd(d9, d9, eps);
+ d10 = _mm512_fmadd_pd(d10, d10, eps);
+ d11 = _mm512_fmadd_pd(d11, d11, eps);
+ // plain permutevar is not yet available in gcc-6.4
+ i0 = _mm512_maskz_permutexvar_epi32(mask, idx, i0);
+ i1 = _mm512_maskz_permutexvar_epi32(mask, idx, i1);
+ i2 = _mm512_maskz_permutexvar_epi32(mask, idx, i2);
+ i3 = _mm512_maskz_permutexvar_epi32(mask, idx, i3);
+ i4 = _mm512_maskz_permutexvar_epi32(mask, idx, i4);
+ i5 = _mm512_maskz_permutexvar_epi32(mask, idx, i5);
+ i6 = _mm512_maskz_permutexvar_epi32(mask, idx, i6);
+ i7 = _mm512_maskz_permutexvar_epi32(mask, idx, i7);
+ i8 = _mm512_maskz_permutexvar_epi32(mask, idx, i8);
+ i9 = _mm512_maskz_permutexvar_epi32(mask, idx, i9);
+ i10 = _mm512_maskz_permutexvar_epi32(mask, idx, i10);
+ i11 = _mm512_maskz_permutexvar_epi32(mask, idx, i11);
+ }
+
+ // Make sure we use all variables in the loop to return a result
+ i0 = _mm512_add_epi32(i0, i1);
+ i2 = _mm512_add_epi32(i2, i3);
+ i4 = _mm512_add_epi32(i4, i5);
+ i6 = _mm512_add_epi32(i6, i7);
+ i8 = _mm512_add_epi32(i8, i9);
+ i10 = _mm512_add_epi32(i10, i11);
+ i0 = _mm512_add_epi32(i0, i2);
+ i4 = _mm512_add_epi32(i4, i6);
+ i8 = _mm512_add_epi32(i8, i10);
+ i0 = _mm512_add_epi32(i0, i4);
+ i0 = _mm512_add_epi32(i0, i8);
+
+ d0 = _mm512_fmadd_pd(d0, d1, d2);
+ d3 = _mm512_fmadd_pd(d3, d4, d5);
+ d6 = _mm512_fmadd_pd(d6, d7, d8);
+ d9 = _mm512_fmadd_pd(d9, d10, d11);
+ d0 = _mm512_add_pd(d0, d3);
+ d6 = _mm512_add_pd(d6, d9);
+ d0 = _mm512_add_pd(d0, d6);
+
+ double data[8];
+ int idata[16];
+ _mm512_storeu_pd(data, d0);
+ _mm512_storeu_si512(idata, i0);
+
+ double d = 0;
+
+ for (int i = 0; i < 8; i++)
+ {
+ d += data[i] * idata[2*i] * idata[2*i+1];
+ }
+
+ return d;
+}
+
+/*\ brief Loop over FMA AVX512 instructions
+ *
+ * This function executes a meaningless loop that includes only
+ * FMA instructions from the AVX512 instruction set.
+ * We need a bit of complex logic to make sure it cannot be
+ * optimized away by the compiler.
+ *
+ * \param loopCount Number of iterations. Each iteration will
+ * execute 12 FMA instructions.
+ * \param seed A double-precision number between 0 and 1.
+ * To be really certain the loop is not optimized
+ * away, you should use some timing-related
+ * function to create this seed at runtime.
+ * \return Meaningless floating-point number. Make sure you
+ * add this number to some variable and conditionally
+ * issue a print statement e.g. if it is negative
+ * (which should not happen), again to make sure the loop
+ * cannot be optimized away.
+ */
+double
+executeFmaOnlyLoop(int loopCount,
+ double seed)
+{
+ // Make sure all variables are different to avoid gcc optimizing them away
+ __m512d d0 = _mm512_set1_pd(1.0-0.01*seed);
+ __m512d d1 = _mm512_set1_pd(1.0-0.02*seed);
+ __m512d d2 = _mm512_set1_pd(1.0-0.03*seed);
+ __m512d d3 = _mm512_set1_pd(1.0-0.04*seed);
+ __m512d d4 = _mm512_set1_pd(1.0-0.05*seed);
+ __m512d d5 = _mm512_set1_pd(1.0-0.06*seed);
+ __m512d d6 = _mm512_set1_pd(1.0-0.07*seed);
+ __m512d d7 = _mm512_set1_pd(1.0-0.08*seed);
+ __m512d d8 = _mm512_set1_pd(1.0-0.09*seed);
+ __m512d d9 = _mm512_set1_pd(1.0-0.10*seed);
+ __m512d d10 = _mm512_set1_pd(1.0-0.11*seed);
+ __m512d d11 = _mm512_set1_pd(1.0-0.12*seed);
+ __m512d eps = _mm512_set1_pd(1e-6);
+
+ for (int i = 0; i < loopCount; i++)
+ {
+ d0 = _mm512_fmadd_pd(d0, d0, eps);
+ d1 = _mm512_fmadd_pd(d1, d1, eps);
+ d2 = _mm512_fmadd_pd(d2, d2, eps);
+ d3 = _mm512_fmadd_pd(d3, d3, eps);
+ d4 = _mm512_fmadd_pd(d4, d4, eps);
+ d5 = _mm512_fmadd_pd(d5, d5, eps);
+ d6 = _mm512_fmadd_pd(d6, d6, eps);
+ d7 = _mm512_fmadd_pd(d7, d7, eps);
+ d8 = _mm512_fmadd_pd(d8, d8, eps);
+ d9 = _mm512_fmadd_pd(d9, d9, eps);
+ d10 = _mm512_fmadd_pd(d10, d10, eps);
+ d11 = _mm512_fmadd_pd(d11, d11, eps);
+ }
+
+ // Make sure we use all variables in the loop to return a result
+ d0 = _mm512_fmadd_pd(d0, d1, d2);
+ d3 = _mm512_fmadd_pd(d3, d4, d5);
+ d6 = _mm512_fmadd_pd(d6, d7, d8);
+ d9 = _mm512_fmadd_pd(d9, d10, d11);
+ d0 = _mm512_add_pd(d0, d3);
+ d6 = _mm512_add_pd(d6, d9);
+ d0 = _mm512_add_pd(d0, d6);
+
+ double data[8];
+
+ _mm512_storeu_pd(data, d0);
+
+ double d = 0;
+
+ for (int i = 0; i < 8; i++)
+ {
+ d += data[i];
+ }
+ return d;
+}
+
+int
+checkDualAvx512FmaUnits()
+{
+ uint64_t timeFmaAndShuf = 1e9; // Large value
+ uint64_t timeFmaOnly = 1e9; // Large value
+ double dummy;
+ double seed = (rdtscp() & 0xff) / 256.0; // Create an unpredictable small number between 0 and 1
+
+ // Make sure the CPU is in AVX512 mode by executing a fairly long loop
+ dummy = executeFmaOnlyLoop(100000, seed);
+
+ // Execute the mixed FMA/shuffle loop three times
+ for (int i = 0; i < 3; i++)
+ {
+ uint64_t start = rdtscp();
+ dummy += executeFmaAndShuffleLoop(1000, seed);
+ uint64_t res = rdtscp() - start;
+ timeFmaAndShuf = std::min(timeFmaAndShuf, res);
+ }
+
+ // Execute the FMA-only loop three times
+ for (int i = 0; i < 3; i++)
+ {
+ uint64_t start = rdtscp();
+ dummy += executeFmaOnlyLoop(1000, seed);
+ uint64_t res = rdtscp() - start;
+ timeFmaOnly = std::min(timeFmaOnly, res);
+ }
+
+ // Dummy can never be negative, but by using it in the
+ // conditional it cannot be optimized away.
+ return (timeFmaAndShuf > 1.5 * timeFmaOnly || dummy < 0);
+}
+
+
+#endif // SIMD_AVX_512_CXX_SUPPORTED
+
+/*! \brief Mutex to guard the execution of the timing test
+ *
+ * We only execute the test once, and return the saved result
+ * on subsequent calls.
+ */
+std::mutex initMutex;
+
+} // namespace anonymous
+
+int
+identifyAvx512FmaUnits()
+{
+ static bool initialized = false;
+ static int result = false;
+
+ if (!initialized)
+ {
+ std::lock_guard<std::mutex> lock(initMutex);
+
+ if (!initialized)
+ {
+ // For the standalone test binary we assume it will
+ // only be executed on AVX512 hardware, but for the
+ // library version we check the hardware support.
+#ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
+ bool haveAvx512Hardware = true;
+#else
+ bool haveAvx512Hardware = CpuInfo::detect().feature(CpuInfo::Feature::X86_Avx512F);
+#endif
+
+ if (haveAvx512Hardware)
+ {
+#if SIMD_AVX_512_CXX_SUPPORTED
+ result = checkDualAvx512FmaUnits() ? 2 : 1;
+#else
+ result = -1; // Cannot run the tests
+#endif
+ }
+ else
+ {
+ result = 0; // Not AVX-512 hardware
+ }
+ initialized = true;
+ }
+ }
+ return result;
+}
+
+} // namespace gmx
+
+#ifdef GMX_IDENTIFY_AVX512_FMA_UNITS_STANDALONE
+int
+main()
+{
+ printf("%d\n", gmx::identifyAvx512FmaUnits());
+ return 0;
+}
+#endif
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2017, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \libinternal \file
+ * \brief Defines a routine to check the number of AVX512 fma units
+ *
+ * \author Erik Lindahl <erik.lindahl@gmail.com>
+ * \inlibraryapi
+ * \ingroup module_hardware
+ */
+
+namespace gmx
+{
+
+/*! \brief Test whether machine has dual AVX512 FMA units
+ *
+ * \return 1 or 2 for the number of AVX512 FMA units if AVX512
+ * support is present, 0 if we know the hardware does
+ * not have AVX512 support, or -1 if the test cannot
+ * run because the compiler lacked AVX512 support.
+ */
+int
+identifyAvx512FmaUnits();
+
+} // namespace gmx
#include "gromacs/hardware/cpuinfo.h"
#include "gromacs/hardware/hardwaretopology.h"
#include "gromacs/hardware/hw_info.h"
+#include "gromacs/hardware/identifyavx512fmaunits.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/simd/support.h"
#include "gromacs/utility/basedefinitions.h"
s += gmx::formatString("\n");
}
- s += gmx::formatString(" SIMD instructions most likely to fit this hardware: %s",
- gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_min)).c_str());
-
- if (hwinfo->simd_suggest_max > hwinfo->simd_suggest_min)
+ if (cpuInfo.feature(gmx::CpuInfo::Feature::X86_Avx512F))
{
- s += gmx::formatString(" - %s", gmx::simdString(static_cast<gmx::SimdType>(hwinfo->simd_suggest_max)).c_str());
+ int avx512fmaunits = gmx::identifyAvx512FmaUnits();
+ s += gmx::formatString(" Number of AVX-512 FMA units:");
+ if (avx512fmaunits > 0)
+ {
+ s += gmx::formatString(" %d", avx512fmaunits);
+ if (avx512fmaunits == 1)
+ {
+ s += gmx::formatString(" (AVX2 is faster w/o 2 AVX-512 FMA units)");
+ }
+ }
+ else
+ {
+ s += gmx::formatString(" Cannot run AVX-512 detection - assuming 2");
+ }
+ s += gmx::formatString("\n");
}
- s += gmx::formatString("\n");
-
- s += gmx::formatString(" SIMD instructions selected at GROMACS compile time: %s\n",
- gmx::simdString(gmx::simdCompiled()).c_str());
-
- s += gmx::formatString("\n");
s += gmx::formatString(" Hardware topology: ");
switch (hwTop.supportLevel())
fprintf(fplog, "%s\n", detected.c_str());
}
- if (MULTIMASTER(cr))
- {
- std::string detected;
-
- detected = detected_hardware_string(hwinfo, FALSE);
-
- fprintf(stderr, "%s\n", detected.c_str());
- }
+ // Do not spam stderr with all our internal information unless
+ // there was something that actually went wrong; general information
+ // belongs in the logfile.
/* Check the compiled SIMD instruction set against that of the node
* with the lowest SIMD level support (skip if SIMD detection did not work)
#include <string>
#include "gromacs/hardware/cpuinfo.h"
+#include "gromacs/hardware/identifyavx512fmaunits.h"
+#include "gromacs/utility/stringutil.h"
namespace gmx
{
}
else if (c.feature(CpuInfo::Feature::X86_Avx512F))
{
- suggested = SimdType::X86_Avx512;
+ // If we could not identify the number of AVX512 FMA units we assume 2
+ suggested = ( identifyAvx512FmaUnits() == 1 ) ? SimdType::X86_Avx2 : SimdType::X86_Avx512;
}
else if (c.feature(CpuInfo::Feature::X86_Avx2))
{
FILE * log,
bool warnToStdErr)
{
- SimdType compiled = simdCompiled();
+ SimdType compiled = simdCompiled();
- // Normally it is close to catastrophic if the compiled SIMD type is larger than
- // the supported one, but AVX128Fma is an exception: AMD CPUs will (strongly) prefer
- // AVX128Fma, but they will work fine with AVX too. Thus, make an exception for this.
- if (compiled > wanted && !(compiled == SimdType::X86_Avx && wanted == SimdType::X86_Avx128Fma))
+ gmx::TextLineWrapper wrapper;
+ std::string logMsg;
+ std::string warnMsg;
+
+ wrapper.settings().setLineLength(78);
+
+ if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx512)
{
- fprintf(stderr, "Warning: SIMD instructions newer than hardware. Program will likely crash.\n"
- "SIMD instructions most likely to fit this hardware: %s\n"
- "SIMD instructions selected at GROMACS compile time: %s\n\n",
- simdString(wanted).c_str(),
- simdString(compiled).c_str());
+ logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
+ "SIMD instructions selected at compile time: %s\n"
+ "This program was compiled for different hardware than you are running on, "
+ "which could influence performance. This build might have been configured on "
+ "a login node with only a single AVX-512 FMA unit (in which case AVX2 is faster), "
+ "while the node you are running on has dual AVX-512 FMA units.",
+ simdString(wanted).c_str(), simdString(compiled).c_str()));
+ warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
+ simdString(compiled).c_str(), simdString(wanted).c_str()));
+ }
+ else if (compiled == SimdType::X86_Avx512 && wanted == SimdType::X86_Avx2 && identifyAvx512FmaUnits() == 1)
+ {
+ // The reason for explicitly checking the number of FMA units above is to avoid triggering
+ // this conditional if the AVX2 SIMD was requested by some other node in a heterogeneous MPI run.
+ logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
+ "SIMD instructions selected at compile time: %s\n"
+ "This program was compiled for different hardware than you are running on, "
+ "which could influence performance."
+ "This host supports AVX-512, but since it only has 1 AVX-512"
+ "FMA unit, it would be faster to use AVX2 instead.",
+ simdString(wanted).c_str(), simdString(compiled).c_str()));
+ warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
+ simdString(compiled).c_str(), simdString(wanted).c_str()));
+ }
+ else if (compiled == SimdType::X86_Avx2 && wanted == SimdType::X86_Avx2_128)
+ {
+ logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
+ "SIMD instructions selected at compile time: %s\n"
+ "This program was compiled for different hardware than you are running on, "
+ "which could influence performance."
+ "Ryzen/Threadripper CPUs support 256-bit AVX2, but 128-bit is faster.",
+ simdString(wanted).c_str(), simdString(compiled).c_str()));
+ warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
+ simdString(compiled).c_str(), simdString(wanted).c_str()));
+ }
+ else if (compiled > wanted && !(compiled == SimdType::X86_Avx && wanted == SimdType::X86_Avx128Fma))
+ {
+ // Normally it is close to catastrophic if the compiled SIMD type is larger than
+ // the supported one, but AVX128Fma is an exception: AMD CPUs will (strongly) prefer
+ // AVX128Fma, but they will work fine with AVX too. Thus, make an exception for this.
+ logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
+ "SIMD instructions selected at compile time: %s\n"
+ "Compiled SIMD newer than requested; program might crash.",
+ simdString(wanted).c_str(), simdString(compiled).c_str()));
+ warnMsg = logMsg;
}
else if (wanted != compiled)
{
// This warning will also occur if compiled is X86_Avx and wanted is X86_Avx128Fma
+ logMsg = wrapper.wrapToString(formatString("Highest SIMD level requested by all nodes in run: %s\n"
+ "SIMD instructions selected at compile time: %s\n"
+ "This program was compiled for different hardware than you are running on, "
+ "which could influence performance.",
+ simdString(wanted).c_str(), simdString(compiled).c_str()));
+ warnMsg = wrapper.wrapToString(formatString("Compiled SIMD: %s, but for this host/run %s might be better (see log).",
+ simdString(compiled).c_str(), simdString(wanted).c_str()));
+ }
- if (log != nullptr)
- {
- fprintf(log, "\nBinary not matching hardware - you might be losing performance.\n"
- "SIMD instructions most likely to fit this hardware: %s\n"
- "SIMD instructions selected at GROMACS compile time: %s\n\n",
- simdString(wanted).c_str(),
- simdString(compiled).c_str());
- }
- if (warnToStdErr)
- {
- fprintf(stderr, "Compiled SIMD instructions: %s, GROMACS could use %s on this machine, which is better.\n\n",
- simdString(compiled).c_str(),
- simdString(wanted).c_str());
- }
+ if (log != nullptr)
+ {
+ fprintf(log, "%s\n", logMsg.c_str());
+ }
+ if (warnToStdErr)
+ {
+ fprintf(stderr, "%s\n", warnMsg.c_str());
}
+
return (wanted == compiled);
}