Make AVX2 pass TSAN

author Berk Hess <hess@kth.se>

Fri, 28 Sep 2018 12:19:22 +0000 (14:19 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Thu, 4 Oct 2018 17:38:41 +0000 (19:38 +0200)
author Berk Hess <hess@kth.se>
Fri, 28 Sep 2018 12:19:22 +0000 (14:19 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Thu, 4 Oct 2018 17:38:41 +0000 (19:38 +0200)
diff --git a/cmake/gmxManageSimd.cmake b/cmake/gmxManageSimd.cmake

index f7ffc9192d1bdf3b72a2a2fa122ba274a221e268..5a6d91a0d872ba520ffd28e37d18e71a66fc6eea 100644 (file)
--- a/cmake/gmxManageSimd.cmake
+++ b/cmake/gmxManageSimd.cmake
@@ -404,5 +404,10 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
      endif()
  endif()
  
+string(TOUPPER "${CMAKE_BUILD_TYPE}" _cmake_build_type)
+if (_cmake_build_type STREQUAL "TSAN" AND NOT (GMX_SIMD_ACTIVE STREQUAL "NONE" OR GMX_SIMD_ACTIVE STREQUAL "REFERENCE" OR GMX_SIMD_ACTIVE MATCHES "AVX_512" OR GMX_SIMD_ACTIVE STREQUAL AVX2_256))
+   message(WARNING "TSAN is only tested with SIMD None, Reference, AVX2_256, and AVX_512. It is known to detect (harmless) memory races with SSE and AVX.")
+endif()
+
  endmacro()
  
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt

index bd0dd1be9cb6fff168fa99413383cfbcc5c59f6e..9949f216934ceb7eca5a2047bbfe409cefaed00f 100644 (file)
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -46,6 +46,7 @@ if(GMX_USE_CUDA)
      get_cuda_compiler_info(CUDA_COMPILER_INFO CUDA_COMPILER_FLAGS)
  endif()
  
+string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UPPER)
  configure_file(config.h.cmakein config.h)
  configure_file(gmxpre-config.h.cmakein gmxpre-config.h)
  configure_file(buildinfo.h.cmakein buildinfo.h ESCAPE_QUOTES)
diff --git a/src/config.h.cmakein b/src/config.h.cmakein

index 0f3aecbe57007d6865d0772f4fa7d999d94b4e75..e03dd3c4d42b31359c9cd52ca42ea3b9b61caa52 100644 (file)
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -244,6 +244,19 @@
  /* Cluster size used by nonbonded OpenCL kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */
  #define GMX_OCL_NB_CLUSTER_SIZE @GMX_OCL_NB_CLUSTER_SIZE@
  
+/* Define constants for build types (starting at 1 to make sure undefined values don't match) */
+#define CMAKE_BUILD_TYPE_DEBUG 1
+#define CMAKE_BUILD_TYPE_RELEASE 2
+#define CMAKE_BUILD_TYPE_RELWITHDEBINFO 3
+#define CMAKE_BUILD_TYPE_MINSIZEREL 4
+#define CMAKE_BUILD_TYPE_REFERENCE 5
+#define CMAKE_BUILD_TYPE_RELWITHASSERT 6
+#define CMAKE_BUILD_TYPE_PROFILE 7
+#define CMAKE_BUILD_TYPE_TSAN 8
+#define CMAKE_BUILD_TYPE_ASAN 9
+#define CMAKE_BUILD_TYPE_MSAN 10
+#cmakedefine CMAKE_BUILD_TYPE CMAKE_BUILD_TYPE_@CMAKE_BUILD_TYPE_UPPER@
+
  /* Define relative path to OpenCL kernels */
  #define GMX_INSTALL_OCLDIR "@GMX_INSTALL_OCLDIR@"
  
diff --git a/src/gromacs/mdlib/lincs.cpp b/src/gromacs/mdlib/lincs.cpp

index 82deb3f66582163ae5c7138d489aa01d35c5f0cc..fb3e2c0d7970539f0b8f58472f5ab316118a33c1 100644 (file)
--- a/src/gromacs/mdlib/lincs.cpp
+++ b/src/gromacs/mdlib/lincs.cpp
@@ -756,6 +756,24 @@ static void do_lincsp(const rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
  }
  
  #if GMX_SIMD_HAVE_REAL
+
+//! Helper function so that we can run TSAN with SIMD support (where implemented).
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTransposeTSANSafe(const real         *base,
+                             const std::int32_t *offset,
+                             SimdReal           *v0,
+                             SimdReal           *v1,
+                             SimdReal           *v2)
+{
+#if (CMAKE_BUILD_TYPE == CMAKE_BUILD_TYPE_TSAN) && GMX_SIMD_X86_AVX2_256
+    // This function is only implemented in this case
+    gatherLoadUTransposeSafe<align>(base, offset, v0, v1, v2);
+#else
+    gatherLoadUTranspose<align>(base, offset, v0, v1, v2);
+#endif
+}
+
  /*! \brief Calculate the constraint distance vectors r to project on from x.
   *
   * Determine the right-hand side of the matrix equation using coordinates xp. */
@@ -795,8 +813,8 @@ calc_dr_x_xp_simd(int                       b0,
              offset1[i] = bla[bs*2 + i*2 + 1];
          }
  
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
          rx_S = x0_S - x1_S;
          ry_S = y0_S - y1_S;
          rz_S = z0_S - z1_S;
@@ -812,8 +830,9 @@ calc_dr_x_xp_simd(int                       b0,
  
          transposeScatterStoreU<3>(reinterpret_cast<real *>(r + bs), offset2, rx_S, ry_S, rz_S);
  
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(xp), offset0, &x0_S, &y0_S, &z0_S);
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(xp), offset1, &x1_S, &y1_S, &z1_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(xp), offset0, &x0_S, &y0_S, &z0_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(xp), offset1, &x1_S, &y1_S, &z1_S);
+
          rxp_S = x0_S - x1_S;
          ryp_S = y0_S - y1_S;
          rzp_S = z0_S - z1_S;
@@ -922,8 +941,9 @@ calc_dist_iter_simd(int                       b0,
              offset1[i] = bla[bs*2 + i*2 + 1];
          }
  
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
+
          rx_S = x0_S - x1_S;
          ry_S = y0_S - y1_S;
          rz_S = z0_S - z1_S;
diff --git a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h

index 20ef7c941749b1fc01d464353be4c67b0b9410e8..0d9b3e5388bd4e85cbc89f7d6e928ae219ebb08e 100644 (file)
--- a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h
+++ b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2018, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -36,8 +36,38 @@
  #ifndef GMX_SIMD_IMPL_X86_AVX2_256_UTIL_DOUBLE_H
  #define GMX_SIMD_IMPL_X86_AVX2_256_UTIL_DOUBLE_H
  
+#include "config.h"
+
  #include <immintrin.h>
  
  #include "gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h"
  
+namespace gmx
+{
+
+// This version is marginally slower than the AVX 4-wide component load
+// version on Intel Skylake. On older Intel architectures this version
+// is significantly slower.
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTransposeSafe(const double *        base,
+                         const std::int32_t    offset[],
+                         SimdDouble *          v0,
+                         SimdDouble *          v1,
+                         SimdDouble *          v2)
+{
+    assert(std::size_t(offset) % 16 == 0);
+
+    const SimdDInt32 alignSimd = SimdDInt32(align);
+
+    SimdDInt32       vindex = simdLoad(offset, SimdDInt32Tag());
+    vindex = vindex*alignSimd;
+
+    *v0 = _mm256_i32gather_pd(base + 0, vindex.simdInternal_, sizeof(double));
+    *v1 = _mm256_i32gather_pd(base + 1, vindex.simdInternal_, sizeof(double));
+    *v2 = _mm256_i32gather_pd(base + 2, vindex.simdInternal_, sizeof(double));
+}
+
+}      //namespace gmx
+
  #endif // GMX_SIMD_IMPL_X86_AVX2_256_UTIL_DOUBLE_H
diff --git a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h

index 95fb0a49514dbdc53df2ea8c526a40b333b5328e..110b6d1675f13e0accd3de8ae772006e468a1211 100644 (file)
--- a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h
+++ b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2018, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -36,8 +36,38 @@
  #ifndef GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H
  #define GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H
  
+#include "config.h"
+
  #include <immintrin.h>
  
  #include "gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h"
  
+namespace gmx
+{
+
+// This version is marginally slower than the AVX 4-wide component load
+// version on Intel Skylake. On older Intel architectures this version
+// is significantly slower.
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTransposeSafe(const float *        base,
+                         const std::int32_t   offset[],
+                         SimdFloat *          v0,
+                         SimdFloat *          v1,
+                         SimdFloat *          v2)
+{
+    assert(std::size_t(offset) % 32 == 0);
+
+    const SimdFInt32 alignSimd = SimdFInt32(align);
+
+    SimdFInt32       vindex = simdLoad(offset, SimdFInt32Tag());
+    vindex = vindex*alignSimd;
+
+    *v0 = _mm256_i32gather_ps(base + 0, vindex.simdInternal_, sizeof(float));
+    *v1 = _mm256_i32gather_ps(base + 1, vindex.simdInternal_, sizeof(float));
+    *v2 = _mm256_i32gather_ps(base + 2, vindex.simdInternal_, sizeof(float));
+}
+
+}      // namespace gmx
+
  #endif // GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H
diff --git a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h

index 241c612e12205a60aca4f6adadee61907a0a9227..522dc560ad2ee1bc6f1852950f01a212e84f7c87 100644 (file)
--- a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h
+++ b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h
@@ -115,6 +115,10 @@ gatherLoadTranspose(const double *        base,
  
  static const int c_simdBestPairAlignmentDouble = 2;
  
+// With the implementation below, thread-sanitizer can detect false positives.
+// For loading a triplet, we load 4 floats and ignore the last. Another thread
+// might write to this element, but that will not affect the result.
+// On AVX2 we can use a gather intrinsic instead.
  template <int align>
  static inline void gmx_simdcall
  gatherLoadUTranspose(const double *        base,
diff --git a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h

index f96fe28ce12f964391bd9a0352af08fcda32d2f6..bb7f2c6116e5829732d512d59a88e7fbf78f8483 100644 (file)
--- a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h
+++ b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h
@@ -157,6 +157,10 @@ gatherLoadTranspose(const float *        base,
  
  static const int c_simdBestPairAlignmentFloat = 2;
  
+// With the implementation below, thread-sanitizer can detect false positives.
+// For loading a triplet, we load 4 floats and ignore the last. Another thread
+// might write to this element, but that will not affect the result.
+// On AVX2 we can use a gather intrinsic instead.
  template <int align>
  static inline void gmx_simdcall
  gatherLoadUTranspose(const float *        base,
author	Berk Hess <hess@kth.se>
	Fri, 28 Sep 2018 12:19:22 +0000 (14:19 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Thu, 4 Oct 2018 17:38:41 +0000 (19:38 +0200)
cmake/gmxManageSimd.cmake		patch \| blob \| history
src/CMakeLists.txt		patch \| blob \| history
src/config.h.cmakein		patch \| blob \| history
src/gromacs/mdlib/lincs.cpp		patch \| blob \| history
src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h		patch \| blob \| history
src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h		patch \| blob \| history
src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h		patch \| blob \| history
src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h		patch \| blob \| history