From 8bbf290c45eb338067550e076ee16dea43e1df12 Mon Sep 17 00:00:00 2001
From: Berk Hess
Date: Fri, 28 Sep 2018 14:19:22 +0200
Subject: [PATCH] Make AVX2 pass TSAN

TSAN builds complain about the LINCS code because it reads (and
ignores) one extra element per rvec when gathering rvecs on x86.

Change-Id: Id2da42e3dd2a1ac66b892c503b053b6a6c716a1a
---
 cmake/gmxManageSimd.cmake                 |  5 +++
 src/CMakeLists.txt                        |  1 +
 src/config.h.cmakein                      | 13 ++++++++
 src/gromacs/mdlib/lincs.cpp               | 32 +++++++++++++++----
 .../impl_x86_avx2_256_util_double.h       | 32 ++++++++++++++++++-
 .../impl_x86_avx2_256_util_float.h        | 32 ++++++++++++++++++-
 .../impl_x86_avx_256_util_double.h        |  4 +++
 .../impl_x86_avx_256_util_float.h         |  4 +++
 8 files changed, 115 insertions(+), 8 deletions(-)

diff --git a/cmake/gmxManageSimd.cmake b/cmake/gmxManageSimd.cmake
index f7ffc9192d..5a6d91a0d8 100644
--- a/cmake/gmxManageSimd.cmake
+++ b/cmake/gmxManageSimd.cmake
@@ -404,5 +404,10 @@ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
         endif()
     endif()
 
+string(TOUPPER "${CMAKE_BUILD_TYPE}" _cmake_build_type)
+if (_cmake_build_type STREQUAL "TSAN" AND NOT (GMX_SIMD_ACTIVE STREQUAL "NONE" OR GMX_SIMD_ACTIVE STREQUAL "REFERENCE" OR GMX_SIMD_ACTIVE MATCHES "AVX_512" OR GMX_SIMD_ACTIVE STREQUAL "AVX2_256"))
+    message(WARNING "TSAN is only tested with SIMD None, Reference, AVX2_256, and AVX_512. It is known to detect (harmless) memory races with SSE and AVX.")
+endif()
+
 endmacro()

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index bd0dd1be9c..9949f21693 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -46,6 +46,7 @@ if(GMX_USE_CUDA)
     get_cuda_compiler_info(CUDA_COMPILER_INFO CUDA_COMPILER_FLAGS)
 endif()
 
+string(TOUPPER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_UPPER)
 configure_file(config.h.cmakein config.h)
 configure_file(gmxpre-config.h.cmakein gmxpre-config.h)
 configure_file(buildinfo.h.cmakein buildinfo.h ESCAPE_QUOTES)

diff --git a/src/config.h.cmakein b/src/config.h.cmakein
index 0f3aecbe57..e03dd3c4d4 100644
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -244,6 +244,19 @@
 /* Cluster size used by nonbonded OpenCL kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */
 #define GMX_OCL_NB_CLUSTER_SIZE @GMX_OCL_NB_CLUSTER_SIZE@
 
+/* Define constants for build types (starting at 1 to make sure undefined values don't match) */
+#define CMAKE_BUILD_TYPE_DEBUG 1
+#define CMAKE_BUILD_TYPE_RELEASE 2
+#define CMAKE_BUILD_TYPE_RELWITHDEBINFO 3
+#define CMAKE_BUILD_TYPE_MINSIZEREL 4
+#define CMAKE_BUILD_TYPE_REFERENCE 5
+#define CMAKE_BUILD_TYPE_RELWITHASSERT 6
+#define CMAKE_BUILD_TYPE_PROFILE 7
+#define CMAKE_BUILD_TYPE_TSAN 8
+#define CMAKE_BUILD_TYPE_ASAN 9
+#define CMAKE_BUILD_TYPE_MSAN 10
+#cmakedefine CMAKE_BUILD_TYPE CMAKE_BUILD_TYPE_@CMAKE_BUILD_TYPE_UPPER@
+
 /* Define relative path to OpenCL kernels */
 #define GMX_INSTALL_OCLDIR "@GMX_INSTALL_OCLDIR@"

diff --git a/src/gromacs/mdlib/lincs.cpp b/src/gromacs/mdlib/lincs.cpp
index 82deb3f665..fb3e2c0d79 100644
--- a/src/gromacs/mdlib/lincs.cpp
+++ b/src/gromacs/mdlib/lincs.cpp
@@ -756,6 +756,24 @@ static void do_lincsp(const rvec *x, rvec *f, rvec *fp, t_pbc *pbc,
 }
 
 #if GMX_SIMD_HAVE_REAL
+
+//! Helper function so that we can run TSAN with SIMD support (where implemented).
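+//!
+//! In a TSAN build with AVX2_256 SIMD this dispatches to the gather-based
+//! gatherLoadUTransposeSafe(), which touches only the three elements of each
+//! rvec; all other configurations use the regular gatherLoadUTranspose().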
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTransposeTSANSafe(const real         *base,
+                             const std::int32_t *offset,
+                             SimdReal           *v0,
+                             SimdReal           *v1,
+                             SimdReal           *v2)
+{
+#if (CMAKE_BUILD_TYPE == CMAKE_BUILD_TYPE_TSAN) && GMX_SIMD_X86_AVX2_256
+    // This function is only implemented in this case
+    gatherLoadUTransposeSafe<align>(base, offset, v0, v1, v2);
+#else
+    gatherLoadUTranspose<align>(base, offset, v0, v1, v2);
+#endif
+}
+
 /*! \brief Calculate the constraint distance vectors r to project on from x.
  *
  * Determine the right-hand side of the matrix equation using coordinates xp. */
@@ -795,8 +813,8 @@ calc_dr_x_xp_simd(int b0,
             offset1[i] = bla[bs*2 + i*2 + 1];
         }
 
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
         rx_S = x0_S - x1_S;
         ry_S = y0_S - y1_S;
         rz_S = z0_S - z1_S;
@@ -812,8 +830,9 @@
 
         transposeScatterStoreU<3>(reinterpret_cast<real *>(r + bs), offset2, rx_S, ry_S, rz_S);
 
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(xp), offset0, &x0_S, &y0_S, &z0_S);
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(xp), offset1, &x1_S, &y1_S, &z1_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(xp), offset0, &x0_S, &y0_S, &z0_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(xp), offset1, &x1_S, &y1_S, &z1_S);
+
         rxp_S = x0_S - x1_S;
         ryp_S = y0_S - y1_S;
         rzp_S = z0_S - z1_S;
@@ -922,8 +941,9 @@ calc_dist_iter_simd(int b0,
             offset1[i] = bla[bs*2 + i*2 + 1];
         }
 
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
-        gatherLoadUTranspose<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset0, &x0_S, &y0_S, &z0_S);
+        gatherLoadUTransposeTSANSafe<3>(reinterpret_cast<const real *>(x), offset1, &x1_S, &y1_S, &z1_S);
+
         rx_S = x0_S - x1_S;
         ry_S = y0_S - y1_S;
         rz_S = z0_S - z1_S;

diff --git a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h
index 20ef7c9417..0d9b3e5388 100644
--- a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h
+++ b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_double.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -36,8 +36,38 @@
 #ifndef GMX_SIMD_IMPL_X86_AVX2_256_UTIL_DOUBLE_H
 #define GMX_SIMD_IMPL_X86_AVX2_256_UTIL_DOUBLE_H
 
+#include "config.h"
+
 #include <immintrin.h>
 
 #include "gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h"
 
+namespace gmx
+{
+
+// This version is marginally slower than the AVX 4-wide component load
+// version on Intel Skylake. On older Intel architectures this version
+// is significantly slower.
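+// The gather touches exactly the three elements of each triplet, so, unlike
+// the 4-wide component load, it never reads the unused fourth element that
+// makes thread-sanitizer report a race.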
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTransposeSafe(const double *      base,
+                         const std::int32_t  offset[],
+                         SimdDouble *        v0,
+                         SimdDouble *        v1,
+                         SimdDouble *        v2)
+{
+    assert(std::size_t(offset) % 16 == 0);
+
+    const SimdDInt32 alignSimd = SimdDInt32(align);
+
+    SimdDInt32 vindex = simdLoad(offset, SimdDInt32Tag());
+    vindex = vindex*alignSimd;
+
+    *v0 = _mm256_i32gather_pd(base + 0, vindex.simdInternal_, sizeof(double));
+    *v1 = _mm256_i32gather_pd(base + 1, vindex.simdInternal_, sizeof(double));
+    *v2 = _mm256_i32gather_pd(base + 2, vindex.simdInternal_, sizeof(double));
+}
+
+}      // namespace gmx
+
 #endif // GMX_SIMD_IMPL_X86_AVX2_256_UTIL_DOUBLE_H

diff --git a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h
index 95fb0a4951..110b6d1675 100644
--- a/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h
+++ b/src/gromacs/simd/impl_x86_avx2_256/impl_x86_avx2_256_util_float.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -36,8 +36,38 @@
 #ifndef GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H
 #define GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H
 
+#include "config.h"
+
 #include <immintrin.h>
 
 #include "gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h"
 
+namespace gmx
+{
+
+// This version is marginally slower than the AVX 4-wide component load
+// version on Intel Skylake. On older Intel architectures this version
+// is significantly slower.
+template <int align>
+static inline void gmx_simdcall
+gatherLoadUTransposeSafe(const float *       base,
+                         const std::int32_t  offset[],
+                         SimdFloat *         v0,
+                         SimdFloat *         v1,
+                         SimdFloat *         v2)
+{
+    assert(std::size_t(offset) % 32 == 0);
+
+    const SimdFInt32 alignSimd = SimdFInt32(align);
+
+    SimdFInt32 vindex = simdLoad(offset, SimdFInt32Tag());
+    vindex = vindex*alignSimd;
+
+    *v0 = _mm256_i32gather_ps(base + 0, vindex.simdInternal_, sizeof(float));
+    *v1 = _mm256_i32gather_ps(base + 1, vindex.simdInternal_, sizeof(float));
+    *v2 = _mm256_i32gather_ps(base + 2, vindex.simdInternal_, sizeof(float));
+}
+
+}      // namespace gmx
+
 #endif // GMX_SIMD_IMPL_X86_AVX2_256_UTIL_FLOAT_H

diff --git a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h
index 241c612e12..522dc560ad 100644
--- a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h
+++ b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_double.h
@@ -115,6 +115,10 @@ gatherLoadTranspose(const double * base,
 
 static const int c_simdBestPairAlignmentDouble = 2;
 
+// With the implementation below, thread-sanitizer can detect false positives.
+// For loading a triplet, we load 4 doubles and ignore the last. Another thread
+// might write to this element, but that will not affect the result.
+// On AVX2 we can use a gather intrinsic instead.
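+// That gather-based variant is gatherLoadUTransposeSafe() in the
+// impl_x86_avx2_256 util headers.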
 template <int align>
 static inline void gmx_simdcall
 gatherLoadUTranspose(const double *      base,

diff --git a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h
index f96fe28ce1..bb7f2c6116 100644
--- a/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h
+++ b/src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_util_float.h
@@ -157,6 +157,10 @@ gatherLoadTranspose(const float * base,
 
 static const int c_simdBestPairAlignmentFloat = 2;
 
+// With the implementation below, thread-sanitizer can detect false positives.
+// For loading a triplet, we load 4 floats and ignore the last. Another thread
+// might write to this element, but that will not affect the result.
+// On AVX2 we can use a gather intrinsic instead.
 template <int align>
 static inline void gmx_simdcall
 gatherLoadUTranspose(const float *       base,
-- 
2.22.0
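For readers without the GROMACS SIMD layer at hand, the following minimal sketch (not part of the patch; all names in it are illustrative) contrasts the two access patterns with raw AVX2 intrinsics. The component-load approach reads four contiguous doubles per triplet, x, y, z plus one element that is discarded after the transpose, and that extra read is what TSAN reports when another thread writes the neighbouring element; the gather builds each lane vector from exactly the twelve doubles that are used. Build with, e.g., g++ -O2 -mavx2.

// Illustrative sketch only, not GROMACS code.
#include <immintrin.h>
#include <cstdio>

// Gather the x, y and z components of four triplets (stride of 3 doubles)
// into three 4-lane vectors, touching only the 12 doubles that are used.
static void gatherXYZ(const double *base, const int offset[4],
                      __m256d *vx, __m256d *vy, __m256d *vz)
{
    const __m128i idx = _mm_setr_epi32(offset[0]*3, offset[1]*3,
                                       offset[2]*3, offset[3]*3);
    *vx = _mm256_i32gather_pd(base + 0, idx, sizeof(double));
    *vy = _mm256_i32gather_pd(base + 1, idx, sizeof(double));
    *vz = _mm256_i32gather_pd(base + 2, idx, sizeof(double));
}

// The pattern TSAN flags: one 4-wide load per triplet reads x, y, z and
// one double past z. The extra value is discarded after transposing the
// four loads, but the read itself still happens.
static __m256d overReadingLoad(const double *base, int offset)
{
    return _mm256_loadu_pd(base + offset*3);
}

int main()
{
    double xyz[3*8 + 1]; // eight triplets plus one element of padding
    for (int i = 0; i < 8; i++)
    {
        xyz[3*i + 0] = i;
        xyz[3*i + 1] = i + 0.1;
        xyz[3*i + 2] = i + 0.2;
    }
    xyz[3*8] = 0.0;

    const int offset[4] = { 0, 2, 5, 7 };
    __m256d   vx, vy, vz;
    gatherXYZ(xyz, offset, &vx, &vy, &vz);
    (void)vx;
    (void)vz;

    double y[4];
    _mm256_storeu_pd(y, vy);
    std::printf("y components: %g %g %g %g\n", y[0], y[1], y[2], y[3]); // 0.1 2.1 5.1 7.1

    (void)overReadingLoad(xyz, 7); // reads xyz[21..24]; xyz[24] is the padding
    return 0;
}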