From 80721741a5adb17d4c61235c627792aaeb89cd31 Mon Sep 17 00:00:00 2001
From: Erik Lindahl <erik@kth.se>
Date: Thu, 20 Dec 2012 19:18:04 +0100
Subject: [PATCH] Added work-around for gcc bug in AVX intrinsincs formal
 parameter

Many relatively recent gcc versions (4.5.2, possibly 4.6.0) use incorrect
formal parameters for maskload and maskstore intrinsics. This patch
detects the bug during configuration and works around it with a define.
Refs #1058, and fixes it at least from gcc-4.5.2.

Change-Id: I447a968d2153f5acf59ad170ace69ad1b6b3f24d
---
 CMakeLists.txt                                |  7 ++
 cmake/TestAVXMaskload.c                       | 17 +++++
 cmake/gmxTestAVXMaskload.cmake                | 72 +++++++++++++++++++
 include/gmx_x86_avx_128_fma.h                 | 13 ++++
 include/gmx_x86_avx_256.h                     | 12 ++++
 src/config.h.cmakein                          |  3 +
 .../kernelutil_x86_avx_128_fma_single.h       |  8 +--
 .../kernelutil_x86_avx_256_single.h           | 56 +++++++--------
 8 files changed, 156 insertions(+), 32 deletions(-)
 create mode 100644 cmake/TestAVXMaskload.c
 create mode 100644 cmake/gmxTestAVXMaskload.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 97c022f5cf..c778d56c8c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -692,6 +692,9 @@ if(NOT GMX_SYSTEM_XDR)
     set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_INTERNAL_XDR")
 endif(NOT GMX_SYSTEM_XDR)
 
+# include avx test source, used if the AVX flags are set below
+include(gmxTestAVXMaskload)
+
 # Process nonbonded accelerated kernels settings
 string(TOUPPER ${GMX_CPU_ACCELERATION} ${GMX_CPU_ACCELERATION})
 if(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
@@ -851,6 +854,10 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION}
         endif()
     endif()
 
+    # Unfortunately gcc-4.5.2 and gcc-4.6.0 has a bug where they use the wrong datatype for the formal
+    # parameter of the mask for maskload/maskstore arguments. Check if this is present, since we can work around it.
+    gmx_test_avx_gcc_maskload_bug(${ACCELERATION_C_FLAGS} GMX_X86_AVX_GCC_MASKLOAD_BUG)
+
 elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
 # GMX_CPU_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
     if (NOT ACCELERATION_QUIETLY)
diff --git a/cmake/TestAVXMaskload.c b/cmake/TestAVXMaskload.c
new file mode 100644
index 0000000000..61777b077f
--- /dev/null
+++ b/cmake/TestAVXMaskload.c
@@ -0,0 +1,17 @@
+#include<immintrin.h>
+int main()
+{
+    __m256d a;
+    __m256i mask;
+    double  d[4]={1,2,3,4};
+
+    a = _mm256_setzero_pd();
+    mask = _mm256_castpd_si256(a);
+
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+    a = _mm256_maskload_pd(d,_mm256_castsi256_pd(mask));
+#else
+    a = _mm256_maskload_pd(d,mask);
+#endif
+}
+
diff --git a/cmake/gmxTestAVXMaskload.cmake b/cmake/gmxTestAVXMaskload.cmake
new file mode 100644
index 0000000000..a80920dfc2
--- /dev/null
+++ b/cmake/gmxTestAVXMaskload.cmake
@@ -0,0 +1,72 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+#  GMX_TEST_AVX_GCC_MASKLOAD_BUG(VARIABLE)
+#
+#  VARIABLE will be set if the compiler is a buggy version
+#  of GCC (prior to 4.5.3, and maybe 4.6) that has an incorrect second
+#  argument to the AVX _mm256_maskload_ps() intrinsic.
+#
+#  You need to use this variable in a cmakedefine, and then handle
+#  the case separately in your code - no automatic cure, unfortunately.
+#
+MACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG AVX_CFLAGS VARIABLE)
+    IF(NOT DEFINED ${VARIABLE})
+        MESSAGE(STATUS "Checking for gcc AVX maskload bug") 
+        # some compilers like clang accept both cases, 
+        # so first try a normal compile to avoid flagging those as buggy.
+        TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+                    "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+                    COMPILE_DEFINITIONS "${AVX_CFLAGS}" )
+        IF(${VARIABLE}_COMPILEOK)
+            SET(${VARIABLE} 0 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+            MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+        ELSE()
+            TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+                        "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+                         COMPILE_DEFINITIONS "${AVX_CFLAGS} -DGMX_X86_AVX_GCC_MASKLOAD_BUG" )
+            IF(${VARIABLE}_COMPILEOK)
+                SET(${VARIABLE} 1 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+                MESSAGE(STATUS "Checking for gcc AVX maskload bug - found, will try to work around")
+            ELSE()
+                MESSAGE(WARNING "Cannot compile AVX code - assuming gcc AVX maskload bug not present." )
+                MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+            ENDIF()
+        ENDIF()
+    ENDIF(NOT DEFINED ${VARIABLE})
+ENDMACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG VARIABLE)
+
+
+
+
diff --git a/include/gmx_x86_avx_128_fma.h b/include/gmx_x86_avx_128_fma.h
index 1669da94be..c704073677 100644
--- a/include/gmx_x86_avx_128_fma.h
+++ b/include/gmx_x86_avx_128_fma.h
@@ -206,6 +206,19 @@ static int gmx_mm_check_and_reset_overflow(void)
     return sse_overflow;
 }
 
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
+
 
 
 #endif /* _gmx_x86_avx_128_fma_h_ */
diff --git a/include/gmx_x86_avx_256.h b/include/gmx_x86_avx_256.h
index 9f266e834f..461c736283 100644
--- a/include/gmx_x86_avx_256.h
+++ b/include/gmx_x86_avx_256.h
@@ -286,6 +286,18 @@ static int gmx_mm_check_and_reset_overflow(void)
     return sse_overflow;
 }
 
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+#    define gmx_mm_maskload_ps(mem,mask)       _mm_maskload_ps((mem),(mask))
+#    define gmx_mm_maskstore_ps(mem,mask,x)    _mm_maskstore_ps((mem),(mask),(x))
+#    define gmx_mm256_maskload_ps(mem,mask)    _mm256_maskload_ps((mem),(mask))
+#    define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
 
 
 #endif /* _gmx_x86_avx_256_h_ */
diff --git a/src/config.h.cmakein b/src/config.h.cmakein
index aec0becbbc..fa77489709 100644
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -106,6 +106,9 @@
 /* AVX 256-bit instructions available */
 #cmakedefine GMX_X86_AVX_256
 
+/* GCC bug in AVX maskload/maskstore arguments - worked around internally */
+#cmakedefine GMX_X86_AVX_GCC_MASKLOAD_BUG
+
 /* SSE2 was selected as CPU acceleration level */
 #cmakedefine GMX_CPU_ACCELERATION_X86_SSE2
 
diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
index 8fe321d85c..b7c65ab38c 100644
--- a/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
+++ b/src/gmxlib/nonbonded/nb_kernel_avx_128_fma_single/kernelutil_x86_avx_128_fma_single.h
@@ -227,10 +227,10 @@ gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float *
 {
     __m128 t1,t2,t3,t4;
     __m128i mask = _mm_set_epi32(0,-1,-1,-1);
-    t1             = _mm_maskload_ps(ptrA,mask);
-    t2             = _mm_maskload_ps(ptrB,mask);
-    t3             = _mm_maskload_ps(ptrC,mask);
-    t4             = _mm_maskload_ps(ptrD,mask);
+    t1             = gmx_mm_maskload_ps(ptrA,mask);
+    t2             = gmx_mm_maskload_ps(ptrB,mask);
+    t3             = gmx_mm_maskload_ps(ptrC,mask);
+    t4             = gmx_mm_maskload_ps(ptrD,mask);
     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
     *x1           = t1;
     *y1           = t2;
diff --git a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h
index 1f2ab31be0..c35f1a409e 100644
--- a/src/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h
+++ b/src/gmxlib/nonbonded/nb_kernel_avx_256_single/kernelutil_x86_avx_256_single.h
@@ -334,10 +334,10 @@ gmx_mm256_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
 {
     __m128 t1,t2,t3,t4;
     __m128i mask = _mm_set_epi32(0,-1,-1,-1);
-    t1             = _mm_maskload_ps(ptrA,mask);
-    t2             = _mm_maskload_ps(ptrB,mask);
-    t3             = _mm_maskload_ps(ptrC,mask);
-    t4             = _mm_maskload_ps(ptrD,mask);
+    t1             = gmx_mm_maskload_ps(ptrA,mask);
+    t2             = gmx_mm_maskload_ps(ptrB,mask);
+    t3             = gmx_mm_maskload_ps(ptrC,mask);
+    t4             = gmx_mm_maskload_ps(ptrD,mask);
     _MM_TRANSPOSE4_PS(t1,t2,t3,t4);
     *x1           = _mm256_castps128_ps256(t1);
     *y1           = _mm256_castps128_ps256(t2);
@@ -431,10 +431,10 @@ gmx_mm256_load_1rvec_8ptr_swizzle_ps(const float * gmx_restrict ptrA, const floa
     __m256 t1,t2,t3,t4,t5,t6,t7,t8;
     __m128i mask = _mm_set_epi32(0,-1,-1,-1);
 
-    t1             = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask)); /*  - zE yE xE |  - zA yA xA */
-    t2             = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask)); /*  - zF yF xF |  - zB yB xB */
-    t3             = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask)); /*  - zG yG xG |  - zC yC xC */
-    t4             = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask)); /*  - zH yH xH |  - zD yD xD */
+    t1             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask)); /*  - zE yE xE |  - zA yA xA */
+    t2             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask)); /*  - zF yF xF |  - zB yB xB */
+    t3             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask)); /*  - zG yG xG |  - zC yC xC */
+    t4             = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask)); /*  - zH yH xH |  - zD yD xD */
 
     t5            = _mm256_unpacklo_ps(t1,t2); /* yF yE xF xE | yB yA xB xA */
     t6            = _mm256_unpacklo_ps(t3,t4); /* yH yG xH xG | yD yC xD xC */
@@ -594,20 +594,20 @@ gmx_mm256_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     t3          = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,2,1,0)); /*  -  z1c y1c x1c */
     t4          = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,3,3,2)); /*  -  z1d y1d x1d */
 
-    t5          = _mm_maskload_ps(ptrA,mask);
-    t6          = _mm_maskload_ps(ptrB,mask);
-    t7          = _mm_maskload_ps(ptrC,mask);
-    t8          = _mm_maskload_ps(ptrD,mask);
+    t5          = gmx_mm_maskload_ps(ptrA,mask);
+    t6          = gmx_mm_maskload_ps(ptrB,mask);
+    t7          = gmx_mm_maskload_ps(ptrC,mask);
+    t8          = gmx_mm_maskload_ps(ptrD,mask);
 
     t5          = _mm_sub_ps(t5,t1);
     t6          = _mm_sub_ps(t6,t2);
     t7          = _mm_sub_ps(t7,t3);
     t8          = _mm_sub_ps(t8,t4);
 
-    _mm_maskstore_ps(ptrA,mask,t5);
-    _mm_maskstore_ps(ptrB,mask,t6);
-    _mm_maskstore_ps(ptrC,mask,t7);
-    _mm_maskstore_ps(ptrD,mask,t8);
+    gmx_mm_maskstore_ps(ptrA,mask,t5);
+    gmx_mm_maskstore_ps(ptrB,mask,t6);
+    gmx_mm_maskstore_ps(ptrC,mask,t7);
+    gmx_mm_maskstore_ps(ptrD,mask,t8);
 }
 
 
@@ -762,10 +762,10 @@ gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     /* Construct a mask without executing any data loads */
     mask        = _mm_blend_epi16(_mm_setzero_si128(),_mm_cmpeq_epi16(_mm_setzero_si128(),_mm_setzero_si128()),0x3F);
 
-    tA          = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask));
-    tB          = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask));
-    tC          = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask));
-    tD          = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask));
+    tA          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask));
+    tB          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask));
+    tC          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask));
+    tD          = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask));
     t1          = _mm256_unpacklo_ps(x1,y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
     t2          = _mm256_unpackhi_ps(x1,y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
 
@@ -779,14 +779,14 @@ gmx_mm256_decrement_1rvec_8ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx
     tC          = _mm256_sub_ps(tC,t5);
     tD          = _mm256_sub_ps(tD,t6);
 
-    _mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
-    _mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
-    _mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
-    _mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
-    _mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
-    _mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
-    _mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
-    _mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
+    gmx_mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
+    gmx_mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
+    gmx_mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
+    gmx_mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
+    gmx_mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
+    gmx_mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
+    gmx_mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
+    gmx_mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
 }
 
 
-- 
2.22.0