From 8bbf290c45eb338067550e076ee16dea43e1df12 Mon Sep 17 00:00:00 2001
From: Berk Hess
Date: Fri, 28 Sep 2018 14:19:22 +0200
Subject: [PATCH] Make AVX2 pass TSAN

TSAN builds complain about the LINCS code because it reads (and
ignores) one extra element per rvec when gathering rvecs on x86.

Change-Id: Id2da42e3dd2a1ac66b892c503b053b6a6c716a1a
---
 cmake/gmxManageSimd.cmake                 |  5 +++
 src/CMakeLists.txt                        |  1 +
 src/config.h.cmakein                      | 13 ++++++++
 src/gromacs/mdlib/lincs.cpp               | 32 +++++++++++++++----
 .../impl_x86_avx2_256_util_double.h       | 32 ++++++++++++++++++-
 .../impl_x86_avx2_256_util_float.h        | 32 ++++++++++++++++++-
 .../impl_x86_avx_256_util_double.h        |  4 +++
 .../impl_x86_avx_256_util_float.h         |  4 +++
 8 files changed, 115 insertions(+), 8 deletions(-)

diff --git a/cmake/gmxManageSimd.cmake b/cmake/gmxManageSimd.cmake
index f7ffc9192d..5a6d91a0d8 100644
--- a/cmake/gmxManageSimd.cmake
+++ b/cmake/gmxManageSimd.cmake
@@ -404,5 +404,10 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
         endif()
     endif()
 
+string(TOUPPER "${CMAKE_BUILD_TYPE}" _cmake_build_type)
+if (_cmake_build_type STREQUAL "TSAN" AND NOT (GMX_SIMD_ACTIVE STREQUAL "NONE" OR GMX_SIMD_ACTIVE STREQUAL "REFERENCE" OR GMX_SIMD_ACTIVE MATCHES "AVX_512" OR GMX_SIMD_ACTIVE STREQUAL "AVX2_256"))
+    message(WARNING "TSAN is only tested with SIMD None, Reference, AVX2_256, and AVX_512. It is known to detect (harmless) memory races with SSE and AVX.")
+endif()
+
 endmacro()

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index bd0dd1be9c..9949f21693 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -46,6 +46,7 @@ if(GMX_USE_CUDA)
     get_cuda_compiler_info(CUDA_COMPILER_INFO CUDA_COMPILER_FLAGS)
 endif()
 
+string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UPPER)
 configure_file(config.h.cmakein config.h)
 configure_file(gmxpre-config.h.cmakein gmxpre-config.h)
 configure_file(buildinfo.h.cmakein buildinfo.h ESCAPE_QUOTES)

diff --git a/src/config.h.cmakein b/src/config.h.cmakein
index 0f3aecbe57..e03dd3c4d4 100644
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -244,6 +244,19 @@
 /* Cluster size used by nonbonded OpenCL kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */
 #define GMX_OCL_NB_CLUSTER_SIZE @GMX_OCL_NB_CLUSTER_SIZE@
 
+/* Define constants for build types (starting at 1 to make sure undefined values don't match) */
+#define CMAKE_BUILD_TYPE_DEBUG 1
+#define CMAKE_BUILD_TYPE_RELEASE 2
+#define CMAKE_BUILD_TYPE_RELWITHDEBINFO 3
+#define CMAKE_BUILD_TYPE_MINSIZEREL 4
+#define CMAKE_BUILD_TYPE_REFERENCE 5
+#define CMAKE_BUILD_TYPE_RELWITHASSERT 6
+#define CMAKE_BUILD_TYPE_PROFILE 7
+#define CMAKE_BUILD_TYPE_TSAN 8
+#define CMAKE_BUILD_TYPE_ASAN 9
+#define CMAKE_BUILD_TYPE_MSAN 10
+#cmakedefine CMAKE_BUILD_TYPE CMAKE_BUILD_TYPE_@CMAKE_BUILD_TYPE_UPPER@
+
 /* Define relative path to OpenCL kernels */
 #define GMX_INSTALL_OCLDIR "@GMX_INSTALL_OCLDIR@"

diff --git a/src/gromacs/mdlib/lincs.cpp b/src/gromacs/mdlib/lincs.cpp
index 82deb3f665..fb3e2c0d79 100644
--- a/src/gromacs/mdlib/lincs.cpp
+++ b/src/gromacs/mdlib/lincs.cpp
@@ -756,6 +756,24 @@ static void do_lincsp(const rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
 }
 
 #if GMX_SIMD_HAVE_REAL
+
+//! Helper function so that we can run TSAN with SIMD support (where implemented).
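+//!
+//! In a TSAN build with AVX2_256 SIMD this dispatches to the gather-based
+//! gatherLoadUTransposeSafe(), which touches only the three elements of each
+//! rvec; all other configurations use the regular gatherLoadUTranspose().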
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTransposeTSANSafe(const real         *base,
+                             const std::int32_t *offset,
+                             SimdReal           *v0,
+                             SimdReal           *v1,
+                             SimdReal           *v2)
+{
+#if (CMAKE_BUILD_TYPE == CMAKE_BUILD_TYPE_TSAN) && GMX_SIMD_X86_AVX2_256
+    // This function is only implemented in this case
+    gatherLoadUTransposeSafe<align>(base, offset, v0, v1, v2);
+#else
+    gatherLoadUTranspose<align>(base, offset, v0, v1, v2);
+#endif
+}
+
 /*! \brief Calculate the constraint distance vectors r to project on from x.
  *
  * Determine the right-hand side of the matrix equation using coordinates xp. */
@@ -795,8 +813,8 @@ calc_dr_x_xp_simd(int b0,
             offset1[i] = bla[bs*2 + i*2 + 1];
         }
 
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
         rx_S = x0_S - x1_S;
         ry_S = y0_S - y1_S;
         rz_S = z0_S - z1_S;
@@ -812,8 +830,9 @@
 
         transposeScatterStoreU<3>(reinterpret_cast<real *>(r + bs), offset2, rx_S, ry_S, rz_S);
 
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(xp), offset0, &x0_S, &y0_S, &z0_S);
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(xp), offset1, &x1_S, &y1_S, &z1_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(xp), offset0, &x0_S, &y0_S, &z0_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(xp), offset1, &x1_S, &y1_S, &z1_S);
+
         rxp_S = x0_S - x1_S;
         ryp_S = y0_S - y1_S;
         rzp_S = z0_S - z1_S;
@@ -922,8 +941,9 @@ calc_dist_iter_simd(int b0,
             offset1[i] = bla[bs*2 + i*2 + 1];
         }
 
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
+
         rx_S = x0_S - x1_S;
         ry_S = y0_S - y1_S;
         rz_S = z0_S - z1_S;

diff --git a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h
index 20ef7c9417..0d9b3e5388 100644
--- a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h
+++ b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -36,8 +36,38 @@
 #ifndef GMX_SIMD_IMPL_X86_AVX2_256_UTIL_DOUBLE_H
 #define GMX_SIMD_IMPL_X86_AVX2_256_UTIL_DOUBLE_H
 
+#include "config.h"
+
 #include <immintrin.h>
 
 #include "gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h"
 
+namespace gmx
+{
+
+// This version is marginally slower than the AVX 4-wide component load
+// version on Intel Skylake. On older Intel architectures this version
+// is significantly slower.
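+// The gather touches exactly the three elements of each triplet, so, unlike
+// the 4-wide component load, it never reads the unused fourth element that
+// makes thread-sanitizer report a race.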
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTransposeSafe(const double *      base,
+                         const std::int32_t  offset[],
+                         SimdDouble *        v0,
+                         SimdDouble *        v1,
+                         SimdDouble *        v2)
+{
+    assert(std::size_t(offset) % 16 == 0);
+
+    const SimdDInt32 alignSimd = SimdDInt32(align);
+
+    SimdDInt32 vindex = simdLoad(offset, SimdDInt32Tag());
+    vindex = vindex*alignSimd;
+
+    *v0 = _mm256_i32gather_pd(base + 0, vindex.simdInternal_, sizeof(double));
+    *v1 = _mm256_i32gather_pd(base + 1, vindex.simdInternal_, sizeof(double));
+    *v2 = _mm256_i32gather_pd(base + 2, vindex.simdInternal_, sizeof(double));
+}
+
+}      // namespace gmx
+
 #endif // GMX_SIMD_IMPL_X86_AVX2_256_UTIL_DOUBLE_H

diff --git a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h
index 95fb0a4951..110b6d1675 100644
--- a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h
+++ b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -36,8 +36,38 @@
 #ifndef GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H
 #define GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H
 
+#include "config.h"
+
 #include <immintrin.h>
 
 #include "gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h"
 
+namespace gmx
+{
+
+// This version is marginally slower than the AVX 4-wide component load
+// version on Intel Skylake. On older Intel architectures this version
+// is significantly slower.
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTransposeSafe(const float *       base,
+                         const std::int32_t  offset[],
+                         SimdFloat *         v0,
+                         SimdFloat *         v1,
+                         SimdFloat *         v2)
+{
+    assert(std::size_t(offset) % 32 == 0);
+
+    const SimdFInt32 alignSimd = SimdFInt32(align);
+
+    SimdFInt32 vindex = simdLoad(offset, SimdFInt32Tag());
+    vindex = vindex*alignSimd;
+
+    *v0 = _mm256_i32gather_ps(base + 0, vindex.simdInternal_, sizeof(float));
+    *v1 = _mm256_i32gather_ps(base + 1, vindex.simdInternal_, sizeof(float));
+    *v2 = _mm256_i32gather_ps(base + 2, vindex.simdInternal_, sizeof(float));
+}
+
+}      // namespace gmx
+
 #endif // GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H

diff --git a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h
index 241c612e12..522dc560ad 100644
--- a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h
+++ b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h
@@ -115,6 +115,10 @@ gatherLoadTranspose(const double * base,
 
 static const int c_simdBestPairAlignmentDouble = 2;
 
+// With the implementation below, thread-sanitizer can detect false positives.
+// For loading a triplet, we load 4 doubles and ignore the last. Another thread
+// might write to this element, but that will not affect the result.
+// On AVX2 we can use a gather intrinsic instead.
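+// That gather-based variant is gatherLoadUTransposeSafe() in the
+// impl_x86_avx2_256 util headers.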
 template <int align>
 static inline void gmx_simdcall
 gatherLoadUTranspose(const double *      base,

diff --git a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h
index f96fe28ce1..bb7f2c6116 100644
--- a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h
+++ b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h
@@ -157,6 +157,10 @@ gatherLoadTranspose(const float * base,
 
 static const int c_simdBestPairAlignmentFloat = 2;
 
+// With the implementation below, thread-sanitizer can detect false positives.
+// For loading a triplet, we load 4 floats and ignore the last. Another thread
+// might write to this element, but that will not affect the result.
+// On AVX2 we can use a gather intrinsic instead.
 template <int align>
 static inline void gmx_simdcall
 gatherLoadUTranspose(const float *       base,
-- 
2.22.0
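For readers without the GROMACS SIMD layer at hand, the following minimal sketch (not part of the patch; all names in it are illustrative) contrasts the two access patterns with raw AVX2 intrinsics. The component-load approach reads four contiguous doubles per triplet, x, y, z plus one element that is discarded after the transpose, and that extra read is what TSAN reports when another thread writes the neighbouring element; the gather builds each lane vector from exactly the twelve doubles that are used. Build with, e.g., g++ -O2 -mavx2.

// Illustrative sketch only, not GROMACS code.
#include <immintrin.h>
#include <cstdio>

// Gather the x, y and z components of four triplets (stride of 3 doubles)
// into three 4-lane vectors, touching only the 12 doubles that are used.
static void gatherXYZ(const double *base, const int offset[4],
                      __m256d *vx, __m256d *vy, __m256d *vz)
{
    const __m128i idx = _mm_setr_epi32(offset[0]*3, offset[1]*3,
                                       offset[2]*3, offset[3]*3);
    *vx = _mm256_i32gather_pd(base + 0, idx, sizeof(double));
    *vy = _mm256_i32gather_pd(base + 1, idx, sizeof(double));
    *vz = _mm256_i32gather_pd(base + 2, idx, sizeof(double));
}

// The pattern TSAN flags: one 4-wide load per triplet reads x, y, z and
// one double past z. The extra value is discarded after transposing the
// four loads, but the read itself still happens.
static __m256d overReadingLoad(const double *base, int offset)
{
    return _mm256_loadu_pd(base + offset*3);
}

int main()
{
    double xyz[3*8 + 1]; // eight triplets plus one element of padding
    for (int i = 0; i < 8; i++)
    {
        xyz[3*i + 0] = i;
        xyz[3*i + 1] = i + 0.1;
        xyz[3*i + 2] = i + 0.2;
    }
    xyz[3*8] = 0.0;

    const int offset[4] = { 0, 2, 5, 7 };
    __m256d   vx, vy, vz;
    gatherXYZ(xyz, offset, &vx, &vy, &vz);
    (void)vx;
    (void)vz;

    double y[4];
    _mm256_storeu_pd(y, vy);
    std::printf("y components: %g %g %g %g\n", y[0], y[1], y[2], y[3]); // 0.1 2.1 5.1 7.1

    (void)overReadingLoad(xyz, 7); // reads xyz[21..24]; xyz[24] is the padding
    return 0;
}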