set(PKG_CFLAGS "${PKG_CFLAGS} -DGMX_INTERNAL_XDR")
endif(NOT GMX_SYSTEM_XDR)
+# include avx test source, used if the AVX flags are set below
+include(gmxTestAVXMaskload)
+
# Process nonbonded accelerated kernels settings
string(TOUPPER ${GMX_CPU_ACCELERATION} ${GMX_CPU_ACCELERATION})
if(${GMX_CPU_ACCELERATION} STREQUAL "NONE")
endif()
endif()
+ # Unfortunately gcc-4.5.2 and gcc-4.6.0 has a bug where they use the wrong datatype for the formal
+ # parameter of the mask for maskload/maskstore arguments. Check if this is present, since we can work around it.
+ gmx_test_avx_gcc_maskload_bug(${ACCELERATION_C_FLAGS} GMX_X86_AVX_GCC_MASKLOAD_BUG)
+
elseif(${GMX_CPU_ACCELERATION} STREQUAL "BLUEGENE")
# GMX_CPU_ACCELERATION=BlueGene should be set in the Toolchain-BlueGene?-???.cmake file
if (NOT ACCELERATION_QUIETLY)
--- /dev/null
+#include<immintrin.h>
+int main()
+{
+ __m256d a;
+ __m256i mask;
+ double d[4]={1,2,3,4};
+
+ a = _mm256_setzero_pd();
+ mask = _mm256_castpd_si256(a);
+
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+ a = _mm256_maskload_pd(d,_mm256_castsi256_pd(mask));
+#else
+ a = _mm256_maskload_pd(d,mask);
+#endif
+}
+
--- /dev/null
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2012, by the GROMACS development team, led by
+# David van der Spoel, Berk Hess, Erik Lindahl, and including many
+# others, as listed in the AUTHORS file in the top-level source
+# directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+#
+# GMX_TEST_AVX_GCC_MASKLOAD_BUG(VARIABLE)
+#
+# VARIABLE will be set if the compiler is a buggy version
+# of GCC (prior to 4.5.3, and maybe 4.6) that has an incorrect second
+# argument to the AVX _mm256_maskload_ps() intrinsic.
+#
+# You need to use this variable in a cmakedefine, and then handle
+# the case separately in your code - no automatic cure, unfortunately.
+#
+MACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG AVX_CFLAGS VARIABLE)
+ IF(NOT DEFINED ${VARIABLE})
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug")
+ # some compilers like clang accept both cases,
+ # so first try a normal compile to avoid flagging those as buggy.
+ TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+ "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+ COMPILE_DEFINITIONS "${AVX_CFLAGS}" )
+ IF(${VARIABLE}_COMPILEOK)
+ SET(${VARIABLE} 0 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+ ELSE()
+ TRY_COMPILE(${VARIABLE}_COMPILEOK "${CMAKE_BINARY_DIR}"
+ "${CMAKE_SOURCE_DIR}/cmake/TestAVXMaskload.c"
+ COMPILE_DEFINITIONS "${AVX_CFLAGS} -DGMX_X86_AVX_GCC_MASKLOAD_BUG" )
+ IF(${VARIABLE}_COMPILEOK)
+ SET(${VARIABLE} 1 CACHE INTERNAL "Work around GCC bug in AVX maskload argument" FORCE)
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug - found, will try to work around")
+ ELSE()
+ MESSAGE(WARNING "Cannot compile AVX code - assuming gcc AVX maskload bug not present." )
+ MESSAGE(STATUS "Checking for gcc AVX maskload bug - not present")
+ ENDIF()
+ ENDIF()
+ ENDIF(NOT DEFINED ${VARIABLE})
+ENDMACRO(GMX_TEST_AVX_GCC_MASKLOAD_BUG VARIABLE)
+
+
+
+
return sse_overflow;
}
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
+
#endif /* _gmx_x86_avx_128_fma_h_ */
return sse_overflow;
}
+/* Work around gcc bug with wrong type for mask formal parameter to maskload/maskstore */
+#ifdef GMX_X86_AVX_GCC_MASKLOAD_BUG
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),_mm_castsi128_ps(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),_mm_castsi128_ps(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),_mm256_castsi256_ps(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),_mm256_castsi256_ps(mask),(x))
+#else
+# define gmx_mm_maskload_ps(mem,mask) _mm_maskload_ps((mem),(mask))
+# define gmx_mm_maskstore_ps(mem,mask,x) _mm_maskstore_ps((mem),(mask),(x))
+# define gmx_mm256_maskload_ps(mem,mask) _mm256_maskload_ps((mem),(mask))
+# define gmx_mm256_maskstore_ps(mem,mask,x) _mm256_maskstore_ps((mem),(mask),(x))
+#endif
#endif /* _gmx_x86_avx_256_h_ */
/* AVX 256-bit instructions available */
#cmakedefine GMX_X86_AVX_256
+/* GCC bug in AVX maskload/maskstore arguments - worked around internally */
+#cmakedefine GMX_X86_AVX_GCC_MASKLOAD_BUG
+
/* SSE2 was selected as CPU acceleration level */
#cmakedefine GMX_CPU_ACCELERATION_X86_SSE2
{
__m128 t1,t2,t3,t4;
__m128i mask = _mm_set_epi32(0,-1,-1,-1);
- t1 = _mm_maskload_ps(ptrA,mask);
- t2 = _mm_maskload_ps(ptrB,mask);
- t3 = _mm_maskload_ps(ptrC,mask);
- t4 = _mm_maskload_ps(ptrD,mask);
+ t1 = gmx_mm_maskload_ps(ptrA,mask);
+ t2 = gmx_mm_maskload_ps(ptrB,mask);
+ t3 = gmx_mm_maskload_ps(ptrC,mask);
+ t4 = gmx_mm_maskload_ps(ptrD,mask);
_MM_TRANSPOSE4_PS(t1,t2,t3,t4);
*x1 = t1;
*y1 = t2;
{
__m128 t1,t2,t3,t4;
__m128i mask = _mm_set_epi32(0,-1,-1,-1);
- t1 = _mm_maskload_ps(ptrA,mask);
- t2 = _mm_maskload_ps(ptrB,mask);
- t3 = _mm_maskload_ps(ptrC,mask);
- t4 = _mm_maskload_ps(ptrD,mask);
+ t1 = gmx_mm_maskload_ps(ptrA,mask);
+ t2 = gmx_mm_maskload_ps(ptrB,mask);
+ t3 = gmx_mm_maskload_ps(ptrC,mask);
+ t4 = gmx_mm_maskload_ps(ptrD,mask);
_MM_TRANSPOSE4_PS(t1,t2,t3,t4);
*x1 = _mm256_castps128_ps256(t1);
*y1 = _mm256_castps128_ps256(t2);
__m256 t1,t2,t3,t4,t5,t6,t7,t8;
__m128i mask = _mm_set_epi32(0,-1,-1,-1);
- t1 = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask)); /* - zE yE xE | - zA yA xA */
- t2 = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask)); /* - zF yF xF | - zB yB xB */
- t3 = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask)); /* - zG yG xG | - zC yC xC */
- t4 = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask)); /* - zH yH xH | - zD yD xD */
+ t1 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask)); /* - zE yE xE | - zA yA xA */
+ t2 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask)); /* - zF yF xF | - zB yB xB */
+ t3 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask)); /* - zG yG xG | - zC yC xC */
+ t4 = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask)); /* - zH yH xH | - zD yD xD */
t5 = _mm256_unpacklo_ps(t1,t2); /* yF yE xF xE | yB yA xB xA */
t6 = _mm256_unpacklo_ps(t3,t4); /* yH yG xH xG | yD yC xD xC */
t3 = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,2,1,0)); /* - z1c y1c x1c */
t4 = _mm_shuffle_ps(t4,_mm256_castps256_ps128(z1),_MM_SHUFFLE(0,3,3,2)); /* - z1d y1d x1d */
- t5 = _mm_maskload_ps(ptrA,mask);
- t6 = _mm_maskload_ps(ptrB,mask);
- t7 = _mm_maskload_ps(ptrC,mask);
- t8 = _mm_maskload_ps(ptrD,mask);
+ t5 = gmx_mm_maskload_ps(ptrA,mask);
+ t6 = gmx_mm_maskload_ps(ptrB,mask);
+ t7 = gmx_mm_maskload_ps(ptrC,mask);
+ t8 = gmx_mm_maskload_ps(ptrD,mask);
t5 = _mm_sub_ps(t5,t1);
t6 = _mm_sub_ps(t6,t2);
t7 = _mm_sub_ps(t7,t3);
t8 = _mm_sub_ps(t8,t4);
- _mm_maskstore_ps(ptrA,mask,t5);
- _mm_maskstore_ps(ptrB,mask,t6);
- _mm_maskstore_ps(ptrC,mask,t7);
- _mm_maskstore_ps(ptrD,mask,t8);
+ gmx_mm_maskstore_ps(ptrA,mask,t5);
+ gmx_mm_maskstore_ps(ptrB,mask,t6);
+ gmx_mm_maskstore_ps(ptrC,mask,t7);
+ gmx_mm_maskstore_ps(ptrD,mask,t8);
}
/* Construct a mask without executing any data loads */
mask = _mm_blend_epi16(_mm_setzero_si128(),_mm_cmpeq_epi16(_mm_setzero_si128(),_mm_setzero_si128()),0x3F);
- tA = gmx_mm256_set_m128(_mm_maskload_ps(ptrE,mask),_mm_maskload_ps(ptrA,mask));
- tB = gmx_mm256_set_m128(_mm_maskload_ps(ptrF,mask),_mm_maskload_ps(ptrB,mask));
- tC = gmx_mm256_set_m128(_mm_maskload_ps(ptrG,mask),_mm_maskload_ps(ptrC,mask));
- tD = gmx_mm256_set_m128(_mm_maskload_ps(ptrH,mask),_mm_maskload_ps(ptrD,mask));
+ tA = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrE,mask),gmx_mm_maskload_ps(ptrA,mask));
+ tB = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrF,mask),gmx_mm_maskload_ps(ptrB,mask));
+ tC = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrG,mask),gmx_mm_maskload_ps(ptrC,mask));
+ tD = gmx_mm256_set_m128(gmx_mm_maskload_ps(ptrH,mask),gmx_mm_maskload_ps(ptrD,mask));
t1 = _mm256_unpacklo_ps(x1,y1); /* y1f x1f y1e x1e | y1b x1b y1a x1a */
t2 = _mm256_unpackhi_ps(x1,y1); /* y1h x1h y1g x1g | y1d x1d y1c x1c */
tC = _mm256_sub_ps(tC,t5);
tD = _mm256_sub_ps(tD,t6);
- _mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
- _mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
- _mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
- _mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
- _mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
- _mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
- _mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
- _mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
+ gmx_mm_maskstore_ps(ptrA,mask,_mm256_castps256_ps128(tA));
+ gmx_mm_maskstore_ps(ptrB,mask,_mm256_castps256_ps128(tB));
+ gmx_mm_maskstore_ps(ptrC,mask,_mm256_castps256_ps128(tC));
+ gmx_mm_maskstore_ps(ptrD,mask,_mm256_castps256_ps128(tD));
+ gmx_mm_maskstore_ps(ptrE,mask,_mm256_extractf128_ps(tA,0x1));
+ gmx_mm_maskstore_ps(ptrF,mask,_mm256_extractf128_ps(tB,0x1));
+ gmx_mm_maskstore_ps(ptrG,mask,_mm256_extractf128_ps(tC,0x1));
+ gmx_mm_maskstore_ps(ptrH,mask,_mm256_extractf128_ps(tD,0x1));
}