option(GMX_FAHCORE "Build a library with mdrun functionality" OFF)
mark_as_advanced(GMX_FAHCORE)
-include(gmxDetectAcceleration)
-if(NOT DEFINED GMX_CPU_ACCELERATION)
- if(CMAKE_CROSSCOMPILING)
- if("${CMAKE_SYSTEM_NAME}" MATCHES "BlueGeneQ")
- set(GMX_SUGGESTED_CPU_ACCELERATION "IBM_QPX")
- else()
- set(GMX_SUGGESTED_CPU_ACCELERATION "None")
- endif()
- else(CMAKE_CROSSCOMPILING)
- gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION)
- endif(CMAKE_CROSSCOMPILING)
-endif(NOT DEFINED GMX_CPU_ACCELERATION)
+if(NOT DEFINED GMX_CPU_ACCELERATION AND NOT CMAKE_CROSSCOMPILING)
+ include(gmxDetectAcceleration)
+ gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION)
+endif()
+# Detect the architecture the compiler is targeting, detect
+# acceleration possibilities on that hardware, suggest an acceleration
+# to use if none is specified, and populate the cache option for CPU
+# acceleration.
+include(gmxDetectTargetArchitecture)
+gmx_detect_target_architecture()
+include(gmxDetectAcceleration)
+gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION)
set(GMX_CPU_ACCELERATION "${GMX_SUGGESTED_CPU_ACCELERATION}"
CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX, Sparc64_HPC_ACE")
# The user should not be able to set this orthogonally to the acceleration
set(GMX_X86_SSE2 1)
if (NOT ACCELERATION_QUIETLY)
- message(STATUS "Enabling SSE2 Gromacs acceleration, and it will help compiler optimization.")
+ message(STATUS "Enabling SSE2 Gromacs acceleration")
endif()
elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1")
set(GMX_X86_SSE4_1 1)
set(GMX_X86_SSE2 1)
if (NOT ACCELERATION_QUIETLY)
- message(STATUS "Enabling SSE4.1 Gromacs acceleration, and it will help compiler optimization.")
+ message(STATUS "Enabling SSE4.1 Gromacs acceleration")
endif()
if(CMAKE_C_COMPILER_ID MATCHES "Intel" AND C_COMPILER_VERSION VERSION_EQUAL "11.1")
set(GMX_CPU_ACCELERATION_X86_AVX_128_FMA 1)
set(GMX_X86_AVX_128_FMA 1)
if (NOT ACCELERATION_QUIETLY)
- message(STATUS "Enabling 128-bit AVX Gromacs acceleration (with fused-multiply add), and it will help compiler optimization.")
+ message(STATUS "Enabling 128-bit AVX Gromacs acceleration (with fused-multiply add)")
endif()
# We don't have the full compiler version string yet (BUILD_C_COMPILER),
set(GMX_CPU_ACCELERATION_X86_AVX_256 1)
set(GMX_X86_AVX_256 1)
if (NOT ACCELERATION_QUIETLY)
- message(STATUS "Enabling 256-bit AVX Gromacs acceleration, and it will help compiler optimization.")
+ message(STATUS "Enabling 256-bit AVX Gromacs acceleration")
endif()
endif()
gmx_test_avx_gcc_maskload_bug(${ACCELERATION_C_FLAGS} GMX_X86_AVX_GCC_MASKLOAD_BUG)
elseif(${GMX_CPU_ACCELERATION} STREQUAL "IBM_QPX")
- # Used on BlueGene/Q
- if (CMAKE_C_COMPILER_ID MATCHES "XL")
- GMX_TEST_CFLAG(XLC_BLUEGENEQ_CFLAG "-qarch=qp -qtune=qp" ACCELERATION_C_FLAGS)
- try_compile(TEST_QPX ${CMAKE_BINARY_DIR}
- "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c"
- COMPILE_DEFINITIONS "${ACCELERATION_C_FLAGS}")
- if(NOT TEST_QPX)
- message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics.")
- endif()
- endif()
- if (CMAKE_CXX_COMPILER_ID MATCHES "XL" AND CMAKE_CXX_COMPILER_LOADED)
- GMX_TEST_CXXFLAG(XLC_BLUEGENEQ_CXXFLAG "-qarch=qp -qtune=qp" ACCELERATION_CXX_FLAGS)
- try_compile(TEST_QPX ${CMAKE_BINARY_DIR}
- "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c"
- COMPILE_DEFINITIONS "${ACCELERATION_CXX_FLAGS}")
- if(NOT TEST_QPX)
- message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics.")
- endif()
- endif()
+ try_compile(TEST_QPX ${CMAKE_BINARY_DIR}
+ "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c")
if (TEST_QPX)
- message(WARNING "IBM QPX acceleration was selected and could be compiled, but the accelerated kernels are not yet available.")
+ message(WARNING "IBM QPX acceleration was selected. This will work, but SIMD-accelerated kernels are only available for the Verlet cut-off scheme. The plain C kernels that are used for the group cut-off scheme kernels will be slow, so please consider using the Verlet cut-off scheme.")
set(GMX_CPU_ACCELERATION_IBM_QPX 1)
else()
- message(FATAL_ERROR "Cannot compile IBM QPX intrinsics without the XL compiler. If you are compiling for BlueGene/Q, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=BlueGeneQ-static-XL-C' to set up the tool chain.")
+ message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics. If you are compiling for BlueGene/Q with the XL compilers, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=Platform/BlueGeneQ-static-XL-C' to set up the tool chain.")
endif()
elseif(${GMX_CPU_ACCELERATION} STREQUAL "SPARC64_HPC_ACE")
set(GMX_CPU_ACCELERATION_SPARC64_HPC_ACE 1)
if(HAVE_LIBM)
list(APPEND GMX_EXTRA_LIBRARIES m)
endif(HAVE_LIBM)
+if (${CMAKE_SYSTEM_NAME} MATCHES "BlueGene")
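+ # The MASS installation prefix is site-specific; a typical invocation
+ # might make the library findable like this (illustrative path):
+ #   cmake .. -DCMAKE_PREFIX_PATH=/opt/ibmcmp/xlmass/bg/7.3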
+ check_library_exists(mass_simd atan2f4 "" HAVE_MASS_SIMD)
+ if(HAVE_MASS_SIMD)
+ list(APPEND GMX_EXTRA_LIBRARIES mass_simd)
+ else()
+ message(FATAL_ERROR "Could not link to the SIMD version of the IBM MASS library. Please adjust your CMAKE_PREFIX_PATH to contain it")
+ endif()
+endif()
if(GMX_FAHCORE)
set(COREWRAP_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/../corewrap" CACHE STRING
\end{itemize}
The respective '\verb+include+', '\verb+lib+', or '\verb+bin+' is
appended to the path. For each of these variables, a list of paths can
-be specified (on Unix seperated with ":"). Note that these are
+be specified (on Unix separated with ":"). Note that these are
environment variables (and not \cmake{} command-line arguments) and in
a '\verb+bash+' shell are used like:
\begin{verbatim}
accordingly. The internal versions are fine for normal use. If you
need to specify a non-standard path to search, use
\verb+-DCMAKE_PREFIX_PATH=/path/to/search+. If you need to specify a
-library with a non-standard name (e.g. ESSL on AIX), then set
-\verb+-DGMX_BLAS_USER=/path/to/reach/lib/libwhatever.a+.
+library with a non-standard name (e.g. ESSL on AIX or BlueGene), then
+set \verb+-DGMX_BLAS_USER=/path/to/reach/lib/libwhatever.a+.
If you are using Intel's \mkl{} for \fft{}, then the \blas{} and
\lapack{} it provides are used automatically. This could be
\subsubsection{BlueGene/P}
-There is currently no native acceleration on this platform, but the
-default plain C kernels will work.
+There is currently no native acceleration on this platform and no
+plans to make one. The default plain C kernels will work.
\subsubsection{BlueGene/Q}
-There is currently no native acceleration on this platform, but the
-default plain C kernels will work. We have accelerated kernels in
-progress for this platform, but they are not quite done yet.
+There is currently native acceleration on this platform for the Verlet
+cut-off scheme. Accelerated kernels for the group cut-off scheme may
+come in the future, but the default plain C kernels will work.
Only static linking with XL compilers is supported by \gromacs{}. Dynamic
linking would be supported by the architecture and \gromacs{}, but has no
above instructions.
mpicc is used for compiling and linking. This can make it awkward to
-attempt to use IBM's optimized BLAS/LAPACK called ESSL. Since mdrun is
-the only part of \gromacs{} that should normally run on the compute
-nodes, and there is nearly no need for linear algebra support for
-mdrun, it is recommended to use the \gromacs{} built-in linear algebra
-routines - it is rare for this to be a bottleneck.
-
+attempt to use IBM's optimized BLAS/LAPACK called ESSL (see the
+section on linear algebra). Since mdrun is the only part of \gromacs{}
+that should normally run on the compute nodes, and there is nearly no
+need for linear algebra support for mdrun, it is recommended to use
+the \gromacs{} built-in linear algebra routines - it is rare for this
+to be a bottleneck.
+
+The recommended configuration is to use
\begin{verbatim}
-cmake .. -DCMAKE_TOOLCHAIN_FILE=BlueGeneQ-static-XL-C \
- -DCMAKE_PREFIX_PATH=/your/fftw/installation/prefix
+cmake .. -DCMAKE_TOOLCHAIN_FILE=Platform/BlueGeneQ-static-XL-CXX \
+ -DCMAKE_PREFIX_PATH=/your/fftw/installation/prefix \
+ -DGMX_MPI=on
make mdrun
make install-mdrun
\end{verbatim}
+which will build a statically-linked MPI-enabled mdrun for the back
+end. Otherwise, the default \gromacs{} configuration behaviour applies.
+
It is possible to configure and make the remaining \gromacs{} tools
-with the compute node toolchain, but as none of those tools are
-\mpi{}-aware, this would not normally be useful. Instead, these should
-be planned to run on the login node, and a seperate \gromacs{}
-installation performed for that using the login node's toolchain.
+with the compute-node toolchain, but as none of those tools are
+\mpi{}-aware and could then only run on the compute nodes, this
+would not normally be useful. Instead, these should be planned
+to run on the login node, and a separate \gromacs{} installation
+performed for that using the login node's toolchain - not the
+above platform file, or any other compute-node toolchain.
+
+Note that only the MPI build is available for the compute-node
+toolchains. The \gromacs{} thread-MPI and serial builds are not
+useful at all on BlueGene/Q.
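+
+For example, a minimal native build for the login node might look like
+the following (illustrative; adjust the options to suit the front-end
+toolchain):
+\begin{verbatim}
+mkdir build-login
+cd build-login
+cmake .. -DGMX_MPI=off
+make
+make install
+\end{verbatim}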
\subsubsection{Fujitsu PRIMEHPC}
from 4.4 through 4.7, and versions 12 and 13 of the Intel compiler.
Under Windows we test both the Visual Studio compilers and icc,
-We test irregularly on BlueGene/L, BlueGene/P, BlueGene/Q, Cray,
+We test irregularly on BlueGene/Q, Cray,
Fujitsu PRIMEHPC, Google nativeclient and other environments. In
the future we expect ARM to be an important test target too, but this
is currently not included.
"<FLAGS> <CMAKE_${lang}_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
set(CMAKE_${lang}_LINK_EXECUTABLE
"<CMAKE_${lang}_COMPILER> ${BG/Q_${lang}_DEFAULT_EXE_FLAGS}")
+
+ if(CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${compiler_id} STREQUAL "XL")
+ # Work around an unknown compiler bug triggered in
+ # compute_globals(). Using -O0 disables -qhot and this seems
+ # to break the normal OpenMP flag -qsmp unless qualified with
+ # noauto.
+ set(OpenMP_C_FLAGS "-qsmp=noauto" CACHE STRING "Compiler flag for OpenMP parallelization")
+ set(OpenMP_CXX_FLAGS "-qsmp=noauto" CACHE STRING "Compiler flag for OpenMP parallelization")
+ endif()
+
endmacro()
--- /dev/null
+int main()
+{
+#ifdef __bgq__
+ return 0;
+#else
+#error This compiler is not targeting BlueGene/Q
+#endif
+}
--- /dev/null
+int main()
+{
+#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+ return 0;
+#else
+#error This is not x86
+#endif
+}
#
# - Detect CPU information and suggest an acceleration option
#
-# GMX_DETECT_ACCELERATION(GMX_SUGGESTED_ACCELERATION)
+# gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION)
#
# Try to detect CPU information and suggest an acceleration option
-# (such as SSE/AVX) that fits the current CPU.
+# (such as SSE/AVX) that fits the current CPU. These functions assume
+# that gmx_detect_target_architecture() has already been run, so that
+# things like GMX_IS_X86 are already available.
#
-# GMX_SUGGESTED_ACCELERATION
+# Sets ${GMX_SUGGESTED_CPU_ACCELERATION} in the parent scope if
+# GMX_CPU_ACCELERATION is not set (e.g. by the user, or a previous run
+# of CMake).
#
# we rely on inline asm support for GNU!
include(gmxTestInlineASM)
-macro(gmx_detect_acceleration GMX_SUGGESTED_ACCELERATION)
- IF(NOT DEFINED ${GMX_SUGGESTED_ACCELERATION})
+function(gmx_suggest_x86_acceleration _suggested_acceleration)
gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM)
try_run(GMX_CPUID_RUN_ACC GMX_CPUID_COMPILED
${CMAKE_BINARY_DIR}
${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c
- COMPILE_DEFINITIONS "${GCC_INLINE_ASM_DEFINE} -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE"
+ COMPILE_DEFINITIONS "${GCC_INLINE_ASM_DEFINE} -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE -DGMX_IS_X86"
RUN_OUTPUT_VARIABLE OUTPUT_TMP
COMPILE_OUTPUT_VARIABLE GMX_CPUID_COMPILE_OUTPUT
ARGS "-acceleration")
string(STRIP "${OUTPUT_TMP}" OUTPUT_ACC)
- message(STATUS "Detecting best acceleration for this CPU - ${OUTPUT_ACC}")
+ set(${_suggested_acceleration} "${OUTPUT_ACC}" PARENT_SCOPE)
+ message(STATUS "Detected best acceleration for this CPU - ${OUTPUT_ACC}")
+endfunction()
- set(${GMX_SUGGESTED_ACCELERATION} "${OUTPUT_ACC}" CACHE INTERNAL "GROMACS CPU-specific acceleration")
-
- ENDIF(NOT DEFINED ${GMX_SUGGESTED_ACCELERATION})
-endmacro(gmx_detect_acceleration GMX_SUGGESTED_ACCELERATION)
+function(gmx_detect_acceleration _suggested_acceleration)
+ if(NOT DEFINED GMX_CPU_ACCELERATION)
+ if(GMX_IS_BGQ)
+ set(${_suggested_acceleration} "IBM_QPX")
+ elseif(GMX_IS_X86)
+ gmx_suggest_x86_acceleration(${_suggested_acceleration})
+ else()
+ set(${_suggested_acceleration} "None")
+ endif()
+ set(${_suggested_acceleration} ${${_suggested_acceleration}} PARENT_SCOPE)
+ endif()
+endfunction()
--- /dev/null
+# - Define function to detect whether the compiler's target
+# - architecture is one for which GROMACS has special treatment
+# - (e.g. kernel acceleration)
+#
+# Sets GMX_IS_X86 or GMX_IS_BGQ if targeting that architecture
+
+function(gmx_detect_target_architecture)
+ try_compile(GMX_IS_X86 ${CMAKE_BINARY_DIR}
+ "${CMAKE_SOURCE_DIR}/cmake/TestX86.c")
+ try_compile(GMX_IS_BGQ ${CMAKE_BINARY_DIR}
+ "${CMAKE_SOURCE_DIR}/cmake/TestBlueGeneQ.c")
+endfunction()
# - with cmake <=2.8.8: compilers that accept "-dumpversion" argument:
# gcc, Intel Compiler (on Linux and Mac OS), Open64, EkoPath, clang
# (and probably other gcc-compatible compilers).
+# - with cmake <=2.8.8: xlC is not supported (it does not take -dumpversion,
+# but fortunately so far GROMACS never needs to know the version number)
#
# C_COMPILER_VERSION - version string of the current C compiler (CMAKE_C_COMPILER)
# CXX_COMPILER_VERSION - version string of the current C++ compiler (CMAKE_CXX_COMPILER)
set(_cc_dumpversion_res 0)
if (DEFINED CMAKE_C_COMPILER_VERSION AND CMAKE_VERSION VERSION_GREATER 2.8.8)
set(_cc_version ${CMAKE_C_COMPILER_VERSION})
+ elseif (CMAKE_C_COMPILER_ID MATCHES "XL")
+ set(_cc_dumpversion_res 1)
else()
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion
RESULT_VARIABLE _cc_dumpversion_res
set(_cxx_dumpversion_res 0)
if (DEFINED CMAKE_CXX_COMPILER_VERSION AND CMAKE_VERSION VERSION_GREATER 2.8.8)
set(_cxx_version ${CMAKE_CXX_COMPILER_VERSION})
+ elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL")
+ set(_cxx_dumpversion_res 1)
else()
execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion
RESULT_VARIABLE _cxx_dumpversion_res
# BUILD_FLAGS - [output variable] flags for the compiler
#
macro(get_compiler_info LANGUAGE BUILD_COMPILER BUILD_FLAGS)
- execute_process(COMMAND ${CMAKE_${LANGUAGE}_COMPILER} --version
+ if (CMAKE_${LANGUAGE}_COMPILER_ID MATCHES "XL")
+ set(_flag_to_query_version "-qversion")
+ else()
+ set(_flag_to_query_version "--version")
+ endif()
+ execute_process(COMMAND ${CMAKE_${LANGUAGE}_COMPILER} ${_flag_to_query_version}
RESULT_VARIABLE _exec_result
OUTPUT_VARIABLE _compiler_version
ERROR_VARIABLE _compiler_version)
- # Try executing just the compiler command --version failed
+
if(_exec_result)
+ # Try executing just the compiler command, since --version failed
execute_process(COMMAND ${CMAKE_${LANGUAGE}_COMPILER}
RESULT_VARIABLE _exec_result
OUTPUT_VARIABLE _compiler_version
# facility to run lots of jobs on small chunks of the machine. You
# certainly need proper MPI to use a whole chunk of the machine that
# the scheduler will allocate.
-set(GMX_THREAD_MPI OFF CACHE BOOL "Thread-MPI generally not compatible with BlueGene, defaulting to disabled!")
-set(GMX_MPI ON CACHE BOOL "MPI is normally required on BlueGene" FORCE)
+set(GMX_THREAD_MPI OFF CACHE BOOL "GROMACS bundled thread-MPI is not supported on BlueGene" FORCE)
+set(GMX_MPI ON CACHE BOOL "MPI is required on BlueGene" FORCE)
# Access to /etc/passwd is not available on the back end of BlueGeneP
# (at least), despite being detected by CMake. This can cause linker
if(NOT CMAKE_CROSSCOMPILING)
# Get CPU acceleration information
+ set(_compile_definitions "${GCC_INLINE_ASM_DEFINE} -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE -DGMX_IS_X86")
try_run(GMX_CPUID_RUN_VENDOR GMX_CPUID_COMPILED
${CMAKE_BINARY_DIR}
${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c
- COMPILE_DEFINITIONS "${GCC_INLINE_ASM_DEFINE} -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE"
+ COMPILE_DEFINITIONS ${_compile_definitions}
RUN_OUTPUT_VARIABLE OUTPUT_CPU_VENDOR ARGS "-vendor")
try_run(GMX_CPUID_RUN_BRAND GMX_CPUID_COMPILED
${CMAKE_BINARY_DIR}
${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c
- COMPILE_DEFINITIONS "${GCC_INLINE_ASM_DEFINE} -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE"
+ COMPILE_DEFINITIONS ${_compile_definitions}
RUN_OUTPUT_VARIABLE OUTPUT_CPU_BRAND ARGS "-brand")
try_run(GMX_CPUID_RUN_FAMILY GMX_CPUID_COMPILED
${CMAKE_BINARY_DIR}
${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c
- COMPILE_DEFINITIONS "${GCC_INLINE_ASM_DEFINE} -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE"
+ COMPILE_DEFINITIONS ${_compile_definitions}
RUN_OUTPUT_VARIABLE OUTPUT_CPU_FAMILY ARGS "-family")
try_run(GMX_CPUID_RUN_MODEL GMX_CPUID_COMPILED
${CMAKE_BINARY_DIR}
${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c
- COMPILE_DEFINITIONS "${GCC_INLINE_ASM_DEFINE} -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE"
+ COMPILE_DEFINITIONS ${_compile_definitions}
RUN_OUTPUT_VARIABLE OUTPUT_CPU_MODEL ARGS "-model")
try_run(GMX_CPUID_RUN_STEPPING GMX_CPUID_COMPILED
${CMAKE_BINARY_DIR}
${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c
- COMPILE_DEFINITIONS "${GCC_INLINE_ASM_DEFINE} -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE"
+ COMPILE_DEFINITIONS ${_compile_definitions}
RUN_OUTPUT_VARIABLE OUTPUT_CPU_STEPPING ARGS "-stepping")
try_run(GMX_CPUID_RUN_FEATURES GMX_CPUID_COMPILED
${CMAKE_BINARY_DIR}
${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c
- COMPILE_DEFINITIONS "${GCC_INLINE_ASM_DEFINE} -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE"
+ COMPILE_DEFINITIONS ${_compile_definitions}
RUN_OUTPUT_VARIABLE OUTPUT_CPU_FEATURES ARGS "-features")
+ unset(_compile_definitions)
string(STRIP "${OUTPUT_CPU_VENDOR}" OUTPUT_CPU_VENDOR)
string(STRIP "${OUTPUT_CPU_BRAND}" OUTPUT_CPU_BRAND)
# GMX_TEST_INLINE_ASM_GCC_X86(VARIABLE)
#
# VARIABLE will be set to true if GCC x86 inline asm works.
-#
-# Remember to have a cmakedefine for it too...
MACRO(GMX_TEST_INLINE_ASM_GCC_X86 VARIABLE)
IF(NOT DEFINED ${VARIABLE})
#define gmx_simd4_pb gmx_simd4_ref_pb
#define gmx_simd4_load_pr gmx_simd4_ref_load_pr
+#define gmx_simd4_load_bb_pr gmx_simd4_ref_load_pr
#define gmx_simd4_set1_pr gmx_simd4_ref_set1_pr
#define gmx_simd4_setzero_pr gmx_simd4_ref_setzero_pr
#define gmx_simd4_store_pr gmx_simd4_ref_store_pr
#define gmx_simd4_pb __m128
#define gmx_simd4_load_pr _mm_load_ps
+#define gmx_simd4_load_bb_pr _mm_load_ps
#define gmx_simd4_set1_pr _mm_set1_ps
#define gmx_simd4_setzero_pr _mm_setzero_ps
#define gmx_simd4_store_pr _mm_store_ps
#define gmx_simd4_pb __m256d
#define gmx_simd4_load_pr _mm256_load_pd
+#define gmx_simd4_load_bb_pr _mm256_load_pd
#define gmx_simd4_set1_pr _mm256_set1_pd
#define gmx_simd4_setzero_pr _mm256_setzero_pd
#define gmx_simd4_store_pr _mm256_store_pd
#endif /* GMX_HAVE_SIMD4_MACROS */
-
#endif /* GMX_X86_SSE2 */
+#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+/* i.e. BlueGene/Q */
+
+/* This hack works on the compilers that can reach this code. A real
+ solution with broader scope will be proposed in master branch. */
+#define gmx_always_inline __attribute__((always_inline))
+
+#ifdef GMX_SIMD4_SINGLE
+#define GMX_HAVE_SIMD4_MACROS
+#endif
+
+typedef vector4double gmx_simd4_pr;
+typedef vector4double gmx_simd4_pb;
+
+/* The declarations of vec_ld* use non-const pointers, and IBM
+ can't/won't fix this any time soon. So GROMACS has to cast away the
+ const-ness of its pointers before loads. Four-wide SIMD loads
+ sometimes occur from variables of type real, and sometimes from
+ variables of type float (even at double precision), so the correct
+ cast cannot be done easily. The correct cast is necessary because
+ the resulting type determines the alignment assumption of vec_ld*,
+ which is different for float and double. So the loads of
+ always-float variables have to be done with a function that does
+ the correct cast. Since functions cannot be overloaded by type in
+ C, they have to have different names. Thus we have
+ gmx_simd4_load_pr and gmx_simd4_load_bb_pr.
+ */
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_pr(const real *a)
+{
+#ifdef NDEBUG
+ return vec_ld(0, (real *) a);
+#else
+ return vec_lda(0, (real *) a);
+#endif
+}
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_bb_pr(const float *a)
+{
+#ifdef NDEBUG
+ return vec_ld(0, (float *) a);
+#else
+ return vec_lda(0, (float *) a);
+#endif
+}
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_set1_pr(const real a)
+{
+ return vec_splats(a);
+}
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_setzero_pr()
+{
+ return vec_splats(0.0);
+}
+
+/* TODO this will not yet work, because the function might be passed a
+ pointer to a float when running in double precision.
+ */
+static gmx_inline void gmx_always_inline gmx_simd4_store_pr(real *a, gmx_simd4_pr b)
+{
+#ifdef NDEBUG
+ vec_st(b, 0, a);
+#else
+ vec_sta(b, 0, a);
+#endif
+}
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_add_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+{
+ return vec_add(a, b);
+}
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_sub_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+{
+ return vec_sub(a, b);
+}
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_mul_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+{
+ return vec_mul(a, b);
+}
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_madd_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c)
+{
+ return vec_madd(a, b, c);
+}
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_nmsub_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c)
+{
+ return vec_nmsub(a, b, c);
+}
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_min_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+{
+ /* Implemented the same way as max, but with the subtraction
+ operands swapped. */
+ return vec_sel(b, a, vec_sub(b, a));
+}
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_max_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+{
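+ /* vec_sel(b, a, c) picks the element of a wherever the corresponding
+ element of c is non-negative, so selecting on a - b yields the
+ elementwise maximum. */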
+ return vec_sel(b, a, vec_sub(a, b));
+}
+
+static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_blendzero_pr(gmx_simd4_pr a, gmx_simd4_pb b)
+{
+ return vec_sel(gmx_simd4_setzero_pr(), a, b);
+}
+
+static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_cmplt_pr(gmx_simd4_pr a, gmx_simd4_pr b)
+{
+ return vec_cmplt(a, b);
+}
+
+static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_and_pb(gmx_simd4_pb a, gmx_simd4_pb b)
+{
+ return vec_and(a, b);
+}
+
+static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_or_pb(gmx_simd4_pb a, gmx_simd4_pb b)
+{
+ return vec_or(a, b);
+}
+
+static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3(gmx_simd4_pr a, gmx_simd4_pr b)
+{
+ /* The dot product is done solely on the QPX AXU (which is the
+ only available FPU). This is awkward, because pretty much no
+ "horizontal" SIMD-vector operations exist, unlike x86 where
+ SSE4.1 added various kinds of horizontal operations. So we have
+ to make do with shifting vector elements and operating on the
+ results. This makes for lots of data dependency, but the main
+ alternative of storing to memory and reloading is not going to
+ help, either. OpenMP over 2 or 4 hardware threads per core will
+ hide much of the latency from the data dependency. The
+ vec_extract() lets the compiler correctly use a floating-point
+ comparison on the zeroth vector element, which avoids needing
+ memory at all.
+ */
+
+ gmx_simd4_pr dp_shifted_left_0 = vec_mul(a, b);
+ gmx_simd4_pr dp_shifted_left_1 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 1);
+ gmx_simd4_pr dp_shifted_left_2 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 2);
+ gmx_simd4_pr dp = vec_add(dp_shifted_left_2,
+ vec_add(dp_shifted_left_0, dp_shifted_left_1));
+
+ /* See comment in nbnxn_make_pairlist_part() about how this should
+ be able to return a double on PowerPC. */
+ return (float) vec_extract(dp, 0);
+}
+
+static gmx_inline int gmx_always_inline gmx_simd4_anytrue_pb(gmx_simd4_pb a)
+{
+ return gmx_anytrue_pb(a);
+}
+
+#undef gmx_always_inline
+
+#endif /* GMX_CPU_ACCELERATION_IBM_QPX */
#ifdef GMX_HAVE_SIMD4_MACROS
/* Generic functions to extract a SIMD4 aligned pointer from a pointer x.
#endif
#endif
+#ifdef GMX_IS_X86
#ifdef GMX_X86_SSE2
/* This is for general x86 SIMD instruction sets that also support SSE2 */
#include "gmx_x86_avx_256.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_avx_256_double.h"
-#else
+#else /* GMX_DOUBLE */
#include "gmx_math_x86_avx_256_single.h"
-#endif
-#else
+#endif /* GMX_DOUBLE */
+#else /* GMX_X86_AVX_256 */
#ifdef GMX_X86_AVX_128_FMA
#include "gmx_x86_avx_128_fma.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_avx_128_fma_double.h"
-#else
+#else /* GMX_DOUBLE */
#include "gmx_math_x86_avx_128_fma_single.h"
-#endif
-#else
+#endif /* GMX_DOUBLE */
+#else /* GMX_X86_AVX_128_FMA */
#ifdef GMX_X86_SSE4_1
#include "gmx_x86_sse4_1.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_sse4_1_double.h"
-#else
+#else /* GMX_DOUBLE */
#include "gmx_math_x86_sse4_1_single.h"
-#endif
-#else
+#endif /* GMX_DOUBLE */
+#else /* GMX_X86_SSE4_1 */
#ifdef GMX_X86_SSE2
#include "gmx_x86_sse2.h"
#ifdef GMX_DOUBLE
#include "gmx_math_x86_sse2_double.h"
-#else
+#else /* GMX_DOUBLE */
#include "gmx_math_x86_sse2_single.h"
-#endif
-#else
+#endif /* GMX_DOUBLE */
+#else /* GMX_X86_SSE2 */
#error No x86 acceleration defined
-#endif
-#endif
-#endif
-#endif
+#endif /* GMX_X86_SSE2 */
+#endif /* GMX_X86_SSE4_1 */
+#endif /* GMX_X86_AVX_128_FMA */
+#endif /* GMX_X86_AVX_256 */
+
/* exp and trigonometric functions are included above */
#define GMX_SIMD_HAVE_EXP
#define GMX_SIMD_HAVE_TRIGONOMETRIC
return _mm_or_ps(_mm_and_ps(a, sign_mask), b);
};
-static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); };
+static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+{
+ return _mm_add_ps(b, _mm_andnot_ps(a, c));
+};
#define gmx_anytrue_pb _mm_movemask_ps
return _mm_or_pd(_mm_and_pd(a, sign_mask), b);
};
-static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_pd(b, _mm_andnot_pd(a, c)); };
+static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+{
+ return _mm_add_pd(b, _mm_andnot_pd(a, c));
+};
#define gmx_cmplt_pr _mm_cmplt_pd
return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b);
};
-static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_ps(b, _mm256_andnot_ps(a, c)); };
+static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+{
+ return _mm256_add_ps(b, _mm256_andnot_ps(a, c));
+};
/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11)
return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b);
};
-static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_pd(b, _mm256_andnot_pd(a, c)); };
+static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+{
+ return _mm256_add_pd(b, _mm256_andnot_pd(a, c));
+};
/* Less-than (we use ordered, non-signaling, but that's not required) */
#define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11)
#endif /* GMX_X86_SSE2 */
+#endif /* GMX_IS_X86 */
+
+#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+
+/* This hack works on the compilers that can reach this code. A real
+ solution with broader scope will be proposed in master branch. */
+#define gmx_always_inline __attribute__((always_inline))
+
+/* This is for the A2 core on BlueGene/Q that supports IBM's QPX
+ vector built-in functions */
+#define GMX_HAVE_SIMD_MACROS
+#ifdef __clang__
+#include <qpxmath.h>
+#else
+#include "mass_simd.h"
+#endif
+
+/* No need to version the code by the precision, because the QPX AXU
+ extends to and truncates from double precision for free. */
+
+#define GMX_SIMD_WIDTH_HERE 4
+typedef vector4double gmx_mm_pr;
+typedef vector4double gmx_mm_pb;
+typedef vector4double gmx_epi32;
+#define GMX_SIMD_EPI32_WIDTH 4
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_load_pr(const real *a)
+{
+#ifdef NDEBUG
+ return vec_ld(0, (real *) a);
+#else
+ return vec_lda(0, (real *) a);
+#endif
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_load1_pr(const real *a)
+{
+ return vec_splats(*a);
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_set1_pr(real a)
+{
+ return vec_splats(a);
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_setzero_pr()
+{
+ return vec_splats(0.0);
+}
+
+static gmx_inline void gmx_always_inline gmx_store_pr(real *a, gmx_mm_pr b)
+{
+#ifdef NDEBUG
+ vec_st(b, 0, a);
+#else
+ vec_sta(b, 0, a);
+#endif
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_add_pr(gmx_mm_pr a, gmx_mm_pr b)
+{
+ return vec_add(a, b);
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_sub_pr(gmx_mm_pr a, gmx_mm_pr b)
+{
+ return vec_sub(a, b);
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_mul_pr(gmx_mm_pr a, gmx_mm_pr b)
+{
+ return vec_mul(a, b);
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_madd_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c)
+{
+ return vec_madd(a, b, c);
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_nmsub_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c)
+{
+ return vec_nmsub(a, b, c);
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_max_pr(gmx_mm_pr a, gmx_mm_pr b)
+{
+ return vec_sel(b, a, vec_sub(a, b));
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_blendzero_pr(gmx_mm_pr a, gmx_mm_pb b)
+{
+ return vec_sel(gmx_setzero_pr(), a, b);
+}
+
+static gmx_inline gmx_mm_pb gmx_always_inline gmx_cmplt_pr(gmx_mm_pr a, gmx_mm_pr b)
+{
+ return vec_cmplt(a, b);
+}
+
+static gmx_inline gmx_mm_pb gmx_always_inline gmx_and_pb(gmx_mm_pb a, gmx_mm_pb b)
+{
+ return vec_and(a, b);
+}
+
+static gmx_inline gmx_mm_pb gmx_always_inline gmx_or_pb(gmx_mm_pb a, gmx_mm_pb b)
+{
+ return vec_or(a, b);
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_round_pr(gmx_mm_pr a)
+{
+ return vec_round(a);
+}
+
+#define GMX_SIMD_HAVE_FLOOR
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_floor_pr(gmx_mm_pr a)
+{
+ return vec_floor(a);
+}
+
+#define GMX_SIMD_HAVE_BLENDV
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_blendv_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c)
+{
+ return vec_sel(b, a, gmx_cmplt_pr(gmx_setzero_pr(), c));
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b)
+{
+ return vec_cpsgn(a, b);
+};
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c)
+{
+ return vec_add(b, vec_sel(c, gmx_setzero_pr(), a));
+};
+
+static gmx_inline gmx_bool gmx_always_inline
+GMX_SIMD_IS_TRUE(real x)
+{
+ return x >= 0.0;
+}
+
+static gmx_inline gmx_epi32 gmx_always_inline gmx_cvttpr_epi32(gmx_mm_pr a)
+{
+ return vec_ctiwuz(a);
+}
+/* Don't want this, we have floor */
+/* #define gmx_cvtepi32_pr vec_cvtepi32 */
+
+/* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA
+ Architecture only promises 2^-8. So probably no need for
+ Newton-Raphson iterates at single or double. */
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_rsqrt_pr(gmx_mm_pr a)
+{
+ return vec_rsqrte(a);
+}
+
+/* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA
+ Architecture only promises 2^-5. So probably no need for
+ Newton-Raphson iterates at single or double. */
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_rcp_pr(gmx_mm_pr a)
+{
+ return vec_re(a);
+}
+
+/* Note that here, and below, we use the built-in SLEEF port when
+ compiling on BlueGene/Q with clang */
+
+#define GMX_SIMD_HAVE_EXP
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_exp_pr(gmx_mm_pr a)
+{
+#ifdef __clang__
+#ifndef GMX_DOUBLE
+ return xexpf(a);
+#else
+ return xexp(a);
+#endif
+#else
+#ifndef GMX_DOUBLE
+ return expf4(a);
+#else
+ return expd4(a);
+#endif
+#endif
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_sqrt_pr(gmx_mm_pr a)
+{
+#ifdef NDEBUG
+ return vec_swsqrt_nochk(a);
+#else
+ return vec_swsqrt(a);
+#endif
+}
+
+#define GMX_SIMD_HAVE_TRIGONOMETRIC
+static gmx_inline int gmx_always_inline gmx_sincos_pr(gmx_mm_pr a, gmx_mm_pr *b, gmx_mm_pr *c)
+{
+#ifdef __clang__
+#ifndef GMX_DOUBLE
+ xsincosf(a, b, c);
+#else
+ xsincos(a, b, c);
+#endif
+#else
+#ifndef GMX_DOUBLE
+ sincosf4(a, b, c);
+#else
+ sincosd4(a, b, c);
+#endif
+#endif
+ return 1;
+}
+
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_acos_pr(gmx_mm_pr a)
+{
+#ifdef __clang__
+#ifndef GMX_DOUBLE
+ return xacosf(a);
+#else
+ return xacos(a);
+#endif
+#else
+#ifndef GMX_DOUBLE
+ return acosf4(a);
+#else
+ return acosd4(a);
+#endif
+#endif
+}
+
+/* NB The order of parameters here is correct; the
+ documentation of atan2[df]4 in SIMD MASS is wrong. */
+static gmx_inline gmx_mm_pr gmx_always_inline gmx_atan2_pr(gmx_mm_pr a, gmx_mm_pr b)
+{
+#ifdef __clang__
+#ifndef GMX_DOUBLE
+ return xatan2f(a, b);
+#else
+ return xatan2(a, b);
+#endif
+#else
+#ifndef GMX_DOUBLE
+ return atan2f4(a, b);
+#else
+ return atan2d4(a, b);
+#endif
+#endif
+}
+
+static gmx_inline int gmx_always_inline
+gmx_anytrue_pb(gmx_mm_pb a)
+{
+ /* The "anytrue" is done solely on the QPX AXU (which is the only
+ available FPU). This is awkward, because pretty much no
+ "horizontal" SIMD-vector operations exist, unlike x86 where
+ SSE4.1 added various kinds of horizontal operations. So we have
+ to make do with shifting vector elements and operating on the
+ results. This makes for lots of data dependency, but the main
+ alternative of storing to memory and reloading is not going to
+ help, either. OpenMP over 2 or 4 hardware threads per core will
+ hide much of the latency from the data dependency. The
+ vec_extract() lets the compiler correctly use a floating-point
+ comparison on the zeroth vector element, which avoids needing
+ memory at all.
+ */
+ gmx_mm_pb vec_shifted_left_0 = a;
+ gmx_mm_pb vec_shifted_left_1 = vec_sldw(a, a, 1);
+ gmx_mm_pb vec_shifted_left_2 = vec_sldw(a, a, 2);
+ gmx_mm_pb vec_shifted_left_3 = vec_sldw(a, a, 3);
+
+ gmx_mm_pb vec_return = vec_or(vec_or(vec_shifted_left_2, vec_shifted_left_3),
+ vec_or(vec_shifted_left_0, vec_shifted_left_1));
+ return (0.0 < vec_extract(vec_return, 0));
+};
+
+#undef gmx_always_inline
+
+#endif /* GMX_CPU_ACCELERATION_IBM_QPX */
#ifdef GMX_HAVE_SIMD_MACROS
/* Generic functions to extract a SIMD aligned pointer from a pointer x.
#include "gmx_simd_math_single.h"
#endif
+
#endif /* GMX_HAVE_SIMD_MACROS */
#endif /* _gmx_simd_macros_h_ */
return c;
}
-/* Logical AND on SIMD booleans */
-static gmx_inline gmx_simd_ref_pb
+/* Logical AND on SIMD booleans. Can't be static, or it cannot be
+   used as a template parameter (at least on XLC for BlueGene/Q) */
+gmx_inline gmx_simd_ref_pb
gmx_simd_ref_and_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b)
{
gmx_simd_ref_pb c;
return c;
}
-/* Logical OR on SIMD booleans */
-static gmx_inline gmx_simd_ref_pb
+/* Logical OR on SIMD booleans. Can't be static, or it cannot be
+   used as a template parameter (at least on XLC for BlueGene/Q) */
+gmx_inline gmx_simd_ref_pb
gmx_simd_ref_or_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b)
{
gmx_simd_ref_pb c;
GMX_LIBGMX_EXPORT
int gmx_hostname_num(void);
-/* If the first part of the hostname (up to the first dot) ends with a number, returns this number.
- If the first part of the hostname does not ends in a number (0-9 characters), returns 0.
+/* Ostensibly, returns an integer characteristic of and unique to each
+ physical node in the MPI system. If the first part of the MPI
+ hostname (up to the first dot) ends with a number, returns this
+ number. If the first part of the MPI hostname does not end in a
+ number (0-9 characters), returns 0.
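+ For example (illustrative hostnames), "node123" gives 123,
+ "login1.cluster" gives 1, and "frontend" gives 0.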
*/
GMX_LIBGMX_EXPORT
/* #define GMX_NBNXN_SIMD_2XNN */
-#ifdef GMX_X86_SSE2
+#if (defined GMX_X86_SSE2) || (defined GMX_CPU_ACCELERATION_IBM_QPX)
/* Use SIMD accelerated nbnxn search and kernels */
#define GMX_NBNXN_SIMD
#ifndef _nbnxn_pairlist_h
#define _nbnxn_pairlist_h
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
* is found, all subsequent j-entries in the i-entry also have full masks.
*/
typedef struct {
- int cj; /* The j-cluster */
- unsigned excl; /* The topology exclusion (interaction) bits */
+ int cj; /* The j-cluster */
+ unsigned excl; /* The exclusion (interaction) bits */
+#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+ /* Indices into the arrays of SIMD interaction masks. */
+ char interaction_mask_indices[4];
+#endif
} nbnxn_cj_t;
/* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits.
*/
unsigned *simd_exclusion_filter1;
unsigned *simd_exclusion_filter2;
-
- int nout; /* The number of force arrays */
- nbnxn_atomdata_output_t *out; /* Output data structures */
- int nalloc; /* Allocation size of all arrays (for x/f *x/fstride) */
- gmx_bool bUseBufferFlags; /* Use the flags or operate on all atoms */
- nbnxn_buffer_flags_t buffer_flags; /* Flags for buffer zeroing+reduc. */
+#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+ real *simd_interaction_array; /* Array of masks needed for exclusions on QPX */
+#endif
+ int nout; /* The number of force arrays */
+ nbnxn_atomdata_output_t *out; /* Output data structures */
+ int nalloc; /* Allocation size of all arrays (for x/f *x/fstride) */
+ gmx_bool bUseBufferFlags; /* Use the flags or operate on all atoms */
+ nbnxn_buffer_flags_t buffer_flags; /* Flags for buffer zeroing+reduc. */
} nbnxn_atomdata_t;
#ifdef __cplusplus
/* Use AMD core math library */
#cmakedefine GMX_FFT_ACML
+/* Target platform is x86 or x86_64 */
+#cmakedefine GMX_IS_X86
+
+/* Target platform is BlueGene/Q */
+#cmakedefine GMX_IS_BGQ
+
/* SSE2 instructions available */
#cmakedefine GMX_X86_SSE2
/* For convenience, and to enable configure-time invocation, we keep all architectures
* in a single file, but to avoid repeated ifdefs we set the overall architecture here.
*/
-#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64)
+#ifdef GMX_IS_X86
/* OK, it is x86, but can we execute cpuid? */
#if defined(GMX_X86_GCC_INLINE_ASM) || ( defined(_MSC_VER) && ( (_MSC_VER > 1500) || (_MSC_VER==1500 & _MSC_FULL_VER >= 150030729)))
# define GMX_CPUID_X86
#endif
}
+#if defined GMX_LIB_MPI && defined GMX_IS_BGQ
+#include <spi/include/kernel/location.h>
+#endif
int gmx_hostname_num()
{
char mpi_hostname[MPI_MAX_PROCESSOR_NAME], hostnum_str[MPI_MAX_PROCESSOR_NAME];
MPI_Get_processor_name(mpi_hostname, &resultlen);
+#ifdef GMX_IS_BGQ
+ Personality_t personality;
+ Kernel_GetPersonality(&personality, sizeof(personality));
+ /* Each MPI rank has a unique coordinate in a 6-dimensional space
+ (A,B,C,D,E,T), with dimensions A-E corresponding to different
+ physical nodes, and T within each node. Each node has sixteen
+ physical cores, each of which can have up to four hardware
+ threads, so 0 <= T <= 63 (but the maximum value of T depends on
+ the configuration of ranks and OpenMP threads per
+ node). However, T is irrelevant for computing a suitable return
+ value for gmx_hostname_num().
+ */
+ hostnum = personality.Network_Config.Acoord;
+ hostnum *= personality.Network_Config.Bnodes;
+ hostnum += personality.Network_Config.Bcoord;
+ hostnum *= personality.Network_Config.Cnodes;
+ hostnum += personality.Network_Config.Ccoord;
+ hostnum *= personality.Network_Config.Dnodes;
+ hostnum += personality.Network_Config.Dcoord;
+ hostnum *= personality.Network_Config.Enodes;
+ hostnum += personality.Network_Config.Ecoord;
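+ /* Example (illustrative): with Bnodes = Cnodes = Dnodes = Enodes = 2
+ and coordinates (A,B,C,D,E) = (1,0,1,0,1), the mixed-radix value
+ computed above is ((((1*2 + 0)*2 + 1)*2 + 0)*2) + 1 = 21. */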
+#else
/* This procedure can only differentiate nodes with host names
* that end on unique numbers.
*/
/* Use only the last 9 decimals, so we don't overflow an int */
hostnum = strtol(hostnum_str + max(0, j-9), NULL, 10);
}
+#endif
if (debug)
{
- fprintf(debug, "In gmx_setup_nodecomm: hostname '%s', hostnum %d\n",
+ fprintf(debug, "In gmx_hostname_num: hostname '%s', hostnum %d\n",
mpi_hostname, hostnum);
+#ifdef GMX_IS_BGQ
+ fprintf(debug,
+ "Torus ID A: %d / %d B: %d / %d C: %d / %d D: %d / %d E: %d / %d\nNode ID T: %d / %d core: %d / %d hardware thread: %d / %d\n",
+ personality.Network_Config.Acoord,
+ personality.Network_Config.Anodes,
+ personality.Network_Config.Bcoord,
+ personality.Network_Config.Bnodes,
+ personality.Network_Config.Ccoord,
+ personality.Network_Config.Cnodes,
+ personality.Network_Config.Dcoord,
+ personality.Network_Config.Dnodes,
+ personality.Network_Config.Ecoord,
+ personality.Network_Config.Enodes,
+ Kernel_ProcessorCoreID(),
+ 16,
+ Kernel_ProcessorID(),
+ 64,
+ Kernel_ProcessorThreadID(),
+ 4);
+#endif
}
return hostnum;
#endif
gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet");
}
}
+#ifdef GMX_IS_BGQ
+ else
+ {
+ md_print_warn(cr, fplog,
+ "NOTE: There is no SIMD implementation of the group scheme kernels on\n"
+ " BlueGene/Q. You will observe better performance from using the\n"
+ " Verlet cut-off scheme.\n");
+ }
+#endif
}
#ifndef GMX_THREAD_MPI
if (PAR(cr))
#endif
}
- /* Analytical Ewald exclusion correction is only an option in the
- * x86 SIMD kernel. This is faster in single precision
- * on Bulldozer and slightly faster on Sandy Bridge.
+ /* Analytical Ewald exclusion correction is only an option in
+ * the SIMD kernel. On BlueGene/Q, this is faster regardless
+ * of precision. In single precision, this is faster on
+ * Bulldozer, and slightly faster on Sandy Bridge.
*/
-#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
+#if ((defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE) || (defined GMX_CPU_ACCELERATION_IBM_QPX)
*ewald_excl = ewaldexclAnalytical;
#endif
if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
}
}
-#endif /* GMX_X86_SSE2 */
+#endif /* GMX_NBNXN_SIMD */
}
#endif
#endif
#endif
-#else /* GMX_X86_SSE2 */
+#else /* GMX_X86_SSE2 */
/* not GMX_X86_SSE2, but other SIMD */
returnvalue = "SIMD";
#endif /* GMX_X86_SSE2 */
-#else /* GMX_NBNXN_SIMD */
+#else /* GMX_NBNXN_SIMD */
returnvalue = "not available";
#endif /* GMX_NBNXN_SIMD */
break;
#include "nbnxn_consts.h"
#include "nbnxn_internal.h"
#include "nbnxn_search.h"
-#include "nbnxn_atomdata.h"
#include "gmx_omp_nthreads.h"
/* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */
}
}
+#ifdef GMX_NBNXN_SIMD
+static void
+nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat)
+{
+ int i, j;
+ const int simd_width = GMX_SIMD_WIDTH_HERE;
+ int simd_excl_size;
+ /* Set the diagonal cluster pair exclusion mask setup data.
+ * In the kernel we check 0 < j - i to generate the masks.
+ * Here we store j - i for generating the mask for the first i,
+ * we subtract 0.5 to avoid rounding issues.
+ * In the kernel we can subtract 1 to generate the subsequent mask.
+ */
+ int simd_4xn_diag_size;
+ const real simdFalse = -1, simdTrue = 1;
+ real *simd_interaction_array;
+
+ simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
+ snew_aligned(nbat->simd_4xn_diagonal_j_minus_i, simd_4xn_diag_size, NBNXN_MEM_ALIGN);
+ for (j = 0; j < simd_4xn_diag_size; j++)
+ {
+ nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5;
+ }
+
+ snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i, simd_width, NBNXN_MEM_ALIGN);
+ for (j = 0; j < simd_width/2; j++)
+ {
+ /* The j-cluster size is half the SIMD width */
+ nbat->simd_2xnn_diagonal_j_minus_i[j] = j - 0.5;
+ /* The next half of the SIMD width is for i + 1 */
+ nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5;
+ }
+
+ /* We use up to 32 bits for exclusion masking.
+ * The same masks are used for the 4xN and 2x(N+N) kernels.
+ * The masks are read either into epi32 SIMD registers or into
+ * real SIMD registers (together with a cast).
+ * In single precision this means the real and epi32 SIMD registers
+ * are of equal size.
+ * In double precision the epi32 registers can be smaller than
+ * the real registers, so depending on the architecture, we might
+ * need to use two, identical, 32-bit masks per real.
+ */
+ simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width;
+ snew_aligned(nbat->simd_exclusion_filter1, simd_excl_size, NBNXN_MEM_ALIGN);
+ snew_aligned(nbat->simd_exclusion_filter2, simd_excl_size*2, NBNXN_MEM_ALIGN);
+
+ for (j = 0; j < simd_excl_size; j++)
+ {
+ /* Set the consecutive bits for masking pair exclusions */
+ nbat->simd_exclusion_filter1[j] = (1U << j);
+ nbat->simd_exclusion_filter2[j*2 + 0] = (1U << j);
+ nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j);
+ }
+
+#if (defined GMX_CPU_ACCELERATION_IBM_QPX)
+ /* The QPX kernels shouldn't do the bit masking that is done on
+ * x86, because the SIMD units lack bit-wise operations. Instead,
+ * we generate a vector of all 2^4 possible ways an i atom
+ * interacts with its 4 j atoms. Each array entry contains
+ * simd_width signed ints that are read in a single SIMD
+ * load. These ints must contain values that will be interpreted
+ * as true and false when loaded in the SIMD floating-point
+ * registers, ie. any positive or any negative value,
+ * respectively. Each array entry encodes how this i atom will
+ * interact with the 4 j atoms. Matching code exists in
+ * set_ci_top_excls() to generate indices into this array. Those
+ * indices are used in the kernels. */
+
+ simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE;
+ const int qpx_simd_width = GMX_SIMD_WIDTH_HERE;
+ snew_aligned(simd_interaction_array, simd_excl_size * qpx_simd_width, NBNXN_MEM_ALIGN);
+ for (j = 0; j < simd_excl_size; j++)
+ {
+ int index = j * qpx_simd_width;
+ for (i = 0; i < qpx_simd_width; i++)
+ {
+ simd_interaction_array[index + i] = (j & (1 << i)) ? simdTrue : simdFalse;
+ }
+ }
+ nbat->simd_interaction_array = simd_interaction_array;
+#endif
+}
+#endif
+
/* Initializes an nbnxn_atomdata_t data structure */
void nbnxn_atomdata_init(FILE *fp,
nbnxn_atomdata_t *nbat,
#ifdef GMX_NBNXN_SIMD
if (simple)
{
- /* Set the diagonal cluster pair interaction mask setup data.
- * In the kernel we check 0 < j - i to generate the masks.
- * Here we store j - i for generating the mask for the first i (i=0);
- * we substract 0.5 to avoid rounding issues.
- * In the kernel we can subtract 1 to generate the mask for the next i.
- */
- const int simd_width = GMX_SIMD_WIDTH_HERE;
- int simd_4xn_diag_ind_size, simd_interaction_size, j;
-
- simd_4xn_diag_ind_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width);
- snew_aligned(nbat->simd_4xn_diagonal_j_minus_i,
- simd_4xn_diag_ind_size, NBNXN_MEM_ALIGN);
- for (j = 0; j < simd_4xn_diag_ind_size; j++)
- {
- nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5;
- }
-
- snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i,
- simd_width, NBNXN_MEM_ALIGN);
- for (j = 0; j < simd_width/2; j++)
- {
- /* The j-cluster size is half the SIMD width */
- nbat->simd_2xnn_diagonal_j_minus_i[j] = j - 0.5;
- /* The next half of the SIMD width is for i + 1 */
- nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5;
- }
-
- /* We use up to 32 bits for exclusion masking.
- * The same masks are used for the 4xN and 2x(N+N) kernels.
- * The masks are read either into epi32 SIMD registers or into
- * real SIMD registers (together with a cast).
- * In single precision this means the real and epi32 SIMD registers
- * are of equal size.
- * In double precision the epi32 registers can be smaller than
- * the real registers, so depending on the architecture, we might
- * need to use two, identical, 32-bit masks per real.
- */
- simd_interaction_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width;
- snew_aligned(nbat->simd_exclusion_filter1, simd_interaction_size, NBNXN_MEM_ALIGN);
- snew_aligned(nbat->simd_exclusion_filter2, simd_interaction_size*2, NBNXN_MEM_ALIGN);
-
- for (j = 0; j < simd_interaction_size; j++)
- {
- /* Set the consecutive bits for filters pair exclusions masks */
- nbat->simd_exclusion_filter1[j] = (1U << j);
- nbat->simd_exclusion_filter2[j*2 + 0] = (1U << j);
- nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j);
- }
+ nbnxn_atomdata_init_simple_exclusion_masks(nbat);
}
#endif
bTMPIAtomics = false;
#endif
-#if defined(i386) || defined(__x86_64__)
+#ifdef GMX_IS_X86
bX86 = true;
#else
bX86 = false;
static const int nbfp_stride = GMX_SIMD_WIDTH_HERE;
#endif
+/* We use the FDV0 table layout when we can use aligned table loads */
+#if GMX_SIMD_WIDTH_HERE == 4
+#define TAB_FDV0
+#endif
+
+#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+#include "nbnxn_kernel_simd_utils_ibm_qpx.h"
+#endif /* GMX_CPU_ACCELERATION_IBM_QPX */
+
#endif /* GMX_X86_SSE2 */
#endif /* GMX_SIMD_REFERENCE_PLAIN_C */
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
+ * Copyright (c) 2001-2012, The GROMACS Development Team
+ * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * David van der Spoel, Berk Hess, Erik Lindahl, and including many
+ * others, as listed in the AUTHORS file in the top-level source
+ * directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+#ifndef _nbnxn_kernel_simd_utils_ibm_qpx_h_
+#define _nbnxn_kernel_simd_utils_ibm_qpx_h_
+
+typedef gmx_mm_pr gmx_exclfilter;
+static const int filter_stride = 1;
+
+/* The 4xn kernel operates on 4-wide i-force registers */
+typedef gmx_mm_pr gmx_mm_pr4;
+
+/* This file contains all functions/macros for the SIMD kernels
+ * which have explicit dependencies on the j-cluster size and/or SIMD-width.
+ * The functionality which depends on the j-cluster size is:
+ * LJ-parameter lookup
+ * force table lookup
+ * energy group pair energy storage
+ */
+
+/* Collect all [0123] elements of the 4 inputs to out[0123], respectively */
+static gmx_inline void
+gmx_transpose_4_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d,
+ gmx_mm_pr *out0, gmx_mm_pr *out1,
+ gmx_mm_pr *out2, gmx_mm_pr *out3)
+{
+ /* Prepare control vectors for swizzling. In its third input,
+ vec_perm accepts indices into the effective 8-wide SIMD vector
+ created by concatenating its first two inputs. Those indices
+ map data from the input vectors to the output vector.
+
+ vec_gpci() converts an octal literal of the indices into the
+ correct form for vec_perm() to use. That form is an octal digit
+ in bits 0-2 of the mantissa of each double. */
+ gmx_mm_pr p6420 = vec_gpci(06420);
+ gmx_mm_pr p7531 = vec_gpci(07531);
+
+ /* Four-way swizzle (i.e. transpose) of vectors a = a0a1a2a3, etc. */
+ gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420);
+ gmx_mm_pr b3b1a3a1 = vec_perm(a, b, p7531);
+ gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420);
+ gmx_mm_pr d3d1c3c1 = vec_perm(c, d, p7531);
+ *out0 = vec_perm(d2d0c2c0, b2b0a2a0, p7531);
+ *out1 = vec_perm(d3d1c3c1, b3b1a3a1, p7531);
+ *out2 = vec_perm(d2d0c2c0, b2b0a2a0, p6420);
+ *out3 = vec_perm(d3d1c3c1, b3b1a3a1, p6420);
+}
+
+/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
+static gmx_inline void
+gmx_shuffle_4_ps_fil01_to_2_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d,
+ gmx_mm_pr *out0, gmx_mm_pr *out1)
+{
+ gmx_mm_pr p6420 = vec_gpci(06420);
+ gmx_mm_pr p7531 = vec_gpci(07531);
+
+ /* Partial four-way swizzle of vectors a = a0a1a2a3, etc. */
+ gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420);
+ gmx_mm_pr b3b1a3a1 = vec_perm(a, b, p7531);
+ gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420);
+ gmx_mm_pr d3d1c3c1 = vec_perm(c, d, p7531);
+ *out0 = vec_perm(d2d0c2c0, b2b0a2a0, p7531);
+ *out1 = vec_perm(d3d1c3c1, b3b1a3a1, p7531);
+}
+
+/* Collect element 2 of the 4 inputs to out */
+static gmx_inline gmx_mm_pr
+gmx_shuffle_4_ps_fil2_to_1_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d)
+{
+ gmx_mm_pr p6420 = vec_gpci(06420);
+
+ /* Partial four-way swizzle of vectors a = a0a1a2a3, etc. */
+ gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420);
+ gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420);
+ return vec_perm(d2d0c2c0, b2b0a2a0, p6420);
+}
+
+#ifdef TAB_FDV0
+/* Align a stack-based thread-local working array. Table loads on QPX
+ * use the array, but most other implementations do not. */
+static gmx_inline int *
+prepare_table_load_buffer(const int *array)
+{
+ return gmx_simd_align_int(array);
+}
+
+static gmx_inline void
+load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti,
+ gmx_mm_pr *ctab0_S, gmx_mm_pr *ctab1_S)
+{
+#ifdef NDEBUG
+ /* Just like 256-bit AVX, we need to use memory to get indices
+ into integer registers efficiently. */
+ vec_st(ti_S, 0, ti);
+#else
+ vec_sta(ti_S, 0, ti);
+#endif
+
+ /* Here we load 4 aligned reals, but we need just 2 elements of each */
+ gmx_mm_pr a = gmx_load_pr(tab_coul_FDV0 + ti[0] * nbfp_stride);
+ gmx_mm_pr b = gmx_load_pr(tab_coul_FDV0 + ti[1] * nbfp_stride);
+ gmx_mm_pr c = gmx_load_pr(tab_coul_FDV0 + ti[2] * nbfp_stride);
+ gmx_mm_pr d = gmx_load_pr(tab_coul_FDV0 + ti[3] * nbfp_stride);
+
+ gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, ctab0_S, ctab1_S);
+}
+
+static gmx_inline void
+load_table_f_v(const real *tab_coul_FDV0,
+ gmx_epi32 ti_S, int *ti,
+ gmx_mm_pr *ctab0_S, gmx_mm_pr *ctab1_S,
+ gmx_mm_pr *ctabv_S)
+{
+#ifdef NDEBUG
+ /* Just like 256-bit AVX, we need to use memory to get indices
+ into integer registers efficiently. */
+ vec_st(ti_S, 0, ti);
+#else
+ vec_sta(ti_S, 0, ti);
+#endif
+
+ /* Here we load 4 aligned reals, but we need just 3 elements of each. */
+ gmx_mm_pr a = gmx_load_pr(tab_coul_FDV0 + ti[0] * nbfp_stride);
+ gmx_mm_pr b = gmx_load_pr(tab_coul_FDV0 + ti[1] * nbfp_stride);
+ gmx_mm_pr c = gmx_load_pr(tab_coul_FDV0 + ti[2] * nbfp_stride);
+ gmx_mm_pr d = gmx_load_pr(tab_coul_FDV0 + ti[3] * nbfp_stride);
+
+ gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, ctab0_S, ctab1_S);
+ *ctabv_S = gmx_shuffle_4_ps_fil2_to_1_ps(a, b, c, d);
+}
+#else
+
+/* Not required for BlueGene/Q */
+
+#endif
+
+/* Sum the elements within each input register; element i of the
+ * returned register holds the sum of the elements of input i.
+ */
+static gmx_inline gmx_mm_pr
+gmx_mm_transpose_sum4_pr(gmx_mm_pr a, gmx_mm_pr b,
+ gmx_mm_pr c, gmx_mm_pr d)
+{
+ gmx_mm_pr a0b0c0d0, a1b1c1d1, a2b2c2d2, a3b3c3d3;
+ gmx_transpose_4_ps(a, b, c, d,
+ &a0b0c0d0,
+ &a1b1c1d1,
+ &a2b2c2d2,
+ &a3b3c3d3);
+ /* Now reduce the transposed vectors */
+ gmx_mm_pr sum01 = gmx_add_pr(a0b0c0d0, a1b1c1d1);
+ gmx_mm_pr sum23 = gmx_add_pr(a2b2c2d2, a3b3c3d3);
+ return gmx_add_pr(sum01, sum23);
+}
+
+#ifdef GMX_DOUBLE
+/* In double precision on x86 it can be faster to first calculate
+ * single precision square roots for two double precision registers at
+ * once and then use double precision Newton-Raphson iteration to
+ * reach full double precision. For QPX, we just wrap the usual
+ * reciprocal square roots.
+ */
+static gmx_inline void
+gmx_mm_invsqrt2_pd(gmx_mm_pr in0, gmx_mm_pr in1,
+ gmx_mm_pr *out0, gmx_mm_pr *out1)
+{
+ *out0 = gmx_invsqrt_pr(in0);
+ *out1 = gmx_invsqrt_pr(in1);
+}
+#endif
+
+static gmx_inline void
+load_lj_pair_params(const real *nbfp, const int *type, int aj,
+ gmx_mm_pr *c6_S, gmx_mm_pr *c12_S)
+{
+ /* Here we load 4 aligned reals, but we need just 2 elements of each. */
+ gmx_mm_pr a = gmx_load_pr(nbfp + type[aj+0] * nbfp_stride);
+ gmx_mm_pr b = gmx_load_pr(nbfp + type[aj+1] * nbfp_stride);
+ gmx_mm_pr c = gmx_load_pr(nbfp + type[aj+2] * nbfp_stride);
+ gmx_mm_pr d = gmx_load_pr(nbfp + type[aj+3] * nbfp_stride);
+
+ gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, c6_S, c12_S);
+}
+
+/* Define USE_FUNCTIONS_FOR_QPX to use static inline functions
+ * instead of the macros below; the functions seem to exhaust xlC
+ * 12.1 during kernel compilation. */
+
+#ifndef USE_FUNCTIONS_FOR_QPX
+
+#define gmx_load_exclusion_filter(a) vec_ldia(0, (int *) a)
+#define gmx_load_interaction_mask_pb(a, b) vec_ld(a, (real *) b)
+
+#else /* USE_FUNCTIONS_FOR_QPX */
+
+static gmx_inline gmx_exclfilter gmx_load_exclusion_filter(const unsigned *a)
+{
+#ifdef NDEBUG
+ return vec_ldia(0, (int *) a);
+#else
+ return vec_ldiaa(0, (int *) a);
+#endif
+}
+
+/* Code for handling loading and applying exclusion masks. Note that
+ parameter a is not treated as an array index; it is added directly
+ to b, so it must be a byte offset. */
+static gmx_inline gmx_mm_pb gmx_load_interaction_mask_pb(long a, const real *b)
+{
+#ifdef NDEBUG
+ return vec_ld(a, (real *) b);
+#else
+ return vec_lda(a, (real *) b);
+#endif
+}
+
+#endif /* USE_FUNCTIONS_FOR_QPX */
+
+#endif /* _nbnxn_kernel_simd_utils_ibm_qpx_h_ */
#ifndef _nbnxn_kernel_simd_utils_ref_h_
#define _nbnxn_kernel_simd_utils_ref_h_
-typedef gmx_simd_ref_epi32 gmx_simd_ref_exclfilter;
-#define gmx_exclfilter gmx_simd_ref_exclfilter
+typedef gmx_simd_ref_epi32 gmx_simd_ref_exclfilter;
+typedef gmx_simd_ref_exclfilter gmx_exclfilter;
static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
+/* Set the stride for the lookup of the two LJ parameters from their
+ (padded) array. Only strides of 2 and 4 are currently supported. */
+#if defined GMX_NBNXN_SIMD_2XNN
+static const int nbfp_stride = 4;
+#elif defined GMX_DOUBLE
+static const int nbfp_stride = 2;
+#else
+static const int nbfp_stride = 4;
+#endif
+
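+/* A sketch (hypothetical helper, not used here) of the padded layout
+ * this stride implies for a combined type index t; elements beyond
+ * the first two only pad the groups so SIMD loads stay aligned. */
+static gmx_inline void
+set_lj_params_ref(real *nbfp, int t, real c6, real c12)
+{
+ int k;
+
+ nbfp[t*nbfp_stride + 0] = c6;
+ nbfp[t*nbfp_stride + 1] = c12;
+ for (k = 2; k < nbfp_stride; k++)
+ {
+ nbfp[t*nbfp_stride + k] = 0;
+ }
+}
+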
#if GMX_SIMD_WIDTH_HERE > 4
/* The 4xn kernel operates on 4-wide i-force registers */
return c;
}
+#else
+
+typedef gmx_simd_ref_pr gmx_simd_ref_pr4;
+
#endif
* energy group pair energy storage
*/
-#define gmx_exclfilter gmx_epi32
+typedef gmx_epi32 gmx_exclfilter;
static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
/* Transpose 2 double precision registers */
* energy group pair energy storage
*/
-#define gmx_exclfilter gmx_epi32
+typedef gmx_epi32 gmx_exclfilter;
static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE;
/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */
_MM_TRANSPOSE4_PS(in0, in1, in2, in3);
in0 = _mm_add_ps(in0, in1);
in2 = _mm_add_ps(in2, in3);
-
+
return _mm_add_ps(in0, in2);
}
{
__m128 clj_S[UNROLLJ];
int p;
-
+
for (p = 0; p < UNROLLJ; p++)
{
/* Here we load 4 aligned floats, but we need just 2 */
* energy group pair energy storage
*/
-#define gmx_exclfilter gmx_mm_pr
+typedef gmx_mm_pr gmx_exclfilter;
static const int filter_stride = 2;
/* Transpose 2 double precision registers */
in0 = _mm256_hadd_pd(in0, in1);
in2 = _mm256_hadd_pd(in2, in3);
- return _mm256_add_pd(_mm256_permute2f128_pd(in0, in2, 0x20), _mm256_permute2f128_pd(in0, in2, 0x31));
+ return _mm256_add_pd(_mm256_permute2f128_pd(in0, in2, 0x20), _mm256_permute2f128_pd(in0, in2, 0x31));
}
static gmx_inline __m256
__m256 s, ir;
__m256d lu0, lu1;
- s = gmx_2_m128_to_m256(_mm256_cvtpd_ps(in0), _mm256_cvtpd_ps(in1));
+ s = gmx_2_m128_to_m256(_mm256_cvtpd_ps(in0), _mm256_cvtpd_ps(in1));
ir = gmx_mm256_invsqrt_ps_single(s);
lu0 = _mm256_cvtps_pd(_mm256_castps256_ps128(ir));
lu1 = _mm256_cvtps_pd(_mm256_extractf128_ps(ir, 1));
* energy group pair energy storage
*/
-#define gmx_exclfilter gmx_mm_pr
+typedef gmx_mm_pr gmx_exclfilter;
static const int filter_stride = 1;
/* The 4xn kernel operates on 4-wide i-force registers */
#include "maths.h"
#endif
+#ifndef GMX_SIMD_J_UNROLL_SIZE
+#error "Need to define GMX_SIMD_J_UNROLL_SIZE before including the 2xnn kernel common header file"
+#endif
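+/* Each kernel implementation file defines the unroll factor before
+ * including this header, e.g. (illustrative file name):
+ *
+ * #define GMX_SIMD_J_UNROLL_SIZE 2
+ * #include "nbnxn_kernel_simd_2xnn_common.h"
+ */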
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
ajz = ajy + STRIDE;
#ifdef CHECK_EXCLS
- gmx_load_simd_2xnn_interactions(l_cj[cjind].excl, filter_S0, filter_S2, &interact_S0, &interact_S2);
+ gmx_load_simd_2xnn_interactions(l_cj[cjind].excl,
+ filter_S0, filter_S2,
+ &interact_S0, &interact_S2);
#endif /* CHECK_EXCLS */
/* load j atom coordinates */
#endif /* CALC_ENERGIES */
#ifdef CALC_LJ
- fscal_S0 = gmx_mul_pr(rinvsq_S0,
#ifdef CALC_COULOMB
+ fscal_S0 = gmx_mul_pr(rinvsq_S0,
gmx_add_pr(frcoul_S0,
+ gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
#else
+ fscal_S0 = gmx_mul_pr(rinvsq_S0,
(
-#endif
gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
+#endif
#else
fscal_S0 = gmx_mul_pr(rinvsq_S0, frcoul_S0);
#endif /* CALC_LJ */
#if defined CALC_LJ && !defined HALF_LJ
- fscal_S2 = gmx_mul_pr(rinvsq_S2,
#ifdef CALC_COULOMB
+ fscal_S2 = gmx_mul_pr(rinvsq_S2,
gmx_add_pr(frcoul_S2,
+ gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
#else
+ fscal_S2 = gmx_mul_pr(rinvsq_S2,
(
-#endif
gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
+#endif
#else
/* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
fscal_S2 = gmx_mul_pr(rinvsq_S2, frcoul_S2);
#include "maths.h"
#endif
+#ifndef GMX_SIMD_J_UNROLL_SIZE
+#error "Need to define GMX_SIMD_J_UNROLL_SIZE before including the 4xn kernel common header file"
+#endif
+
#define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3])
#define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE
gmx_exclfilter filter_S1,
gmx_exclfilter filter_S2,
gmx_exclfilter filter_S3,
+ const char *interaction_mask_indices,
+ real *simd_interaction_array,
gmx_mm_pb *interact_S0,
gmx_mm_pb *interact_S1,
gmx_mm_pb *interact_S2,
gmx_mm_pb *interact_S3)
{
+#ifdef GMX_X86_SSE2
/* Load integer interaction mask */
gmx_exclfilter mask_pr_S = gmx_load1_exclfilter(excl);
*interact_S0 = gmx_checkbitmask_pb(mask_pr_S, filter_S0);
*interact_S1 = gmx_checkbitmask_pb(mask_pr_S, filter_S1);
*interact_S2 = gmx_checkbitmask_pb(mask_pr_S, filter_S2);
*interact_S3 = gmx_checkbitmask_pb(mask_pr_S, filter_S3);
+#endif
+#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+ const int size = GMX_SIMD_WIDTH_HERE * sizeof(real);
+ *interact_S0 = gmx_load_interaction_mask_pb(size*interaction_mask_indices[0],
+ simd_interaction_array);
+ *interact_S1 = gmx_load_interaction_mask_pb(size*interaction_mask_indices[1],
+ simd_interaction_array);
+ *interact_S2 = gmx_load_interaction_mask_pb(size*interaction_mask_indices[2],
+ simd_interaction_array);
+ *interact_S3 = gmx_load_interaction_mask_pb(size*interaction_mask_indices[3],
+ simd_interaction_array);
+#endif
}
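+
+/* A sketch (hypothetical helper; the real table is built during
+ * atom-data setup) of how the BlueGene/Q mask table used above could
+ * be filled, assuming the QPX convention that boolean vectors hold
+ * 1.0 for true and -1.0 for false. Entry j is the mask whose lane i
+ * is true when bit i of j is set. */
+static gmx_inline void
+fill_simd_interaction_array(real *simd_interaction_array)
+{
+ int j, i;
+
+ for (j = 0; j < (1 << GMX_SIMD_WIDTH_HERE); j++)
+ {
+ for (i = 0; i < GMX_SIMD_WIDTH_HERE; i++)
+ {
+ simd_interaction_array[j*GMX_SIMD_WIDTH_HERE + i] =
+ (j & (1 << i)) ? 1.0 : -1.0;
+ }
+ }
+}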
/* All functionality defines are set here, except for:
ajz = ajy + STRIDE;
#ifdef CHECK_EXCLS
- gmx_load_simd_4xn_interactions(l_cj[cjind].excl, filter_S0, filter_S1, filter_S2, filter_S3, &interact_S0, &interact_S1, &interact_S2, &interact_S3);
+ gmx_load_simd_4xn_interactions(l_cj[cjind].excl,
+ filter_S0, filter_S1,
+ filter_S2, filter_S3,
+#ifdef GMX_CPU_ACCELERATION_IBM_QPX
+ l_cj[cjind].interaction_mask_indices,
+ nbat->simd_interaction_array,
+#else
+ /* The struct fields do not exist
+ except on BlueGene/Q */
+ NULL,
+ NULL,
+#endif
+ &interact_S0, &interact_S1,
+ &interact_S2, &interact_S3);
#endif /* CHECK_EXCLS */
/* load j atom coordinates */
#endif
#endif
#ifndef ENERGY_GROUPS
- Vvdwtot_S = gmx_add_pr(Vvdwtot_S,
#ifndef HALF_LJ
+ Vvdwtot_S = gmx_add_pr(Vvdwtot_S,
gmx_sum4_pr(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3)
+ );
#else
+ Vvdwtot_S = gmx_add_pr(Vvdwtot_S,
gmx_add_pr(VLJ_S0, VLJ_S1)
-#endif
);
+#endif
#else
add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj);
add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj);
#endif /* CALC_ENERGIES */
#ifdef CALC_LJ
- fscal_S0 = gmx_mul_pr(rinvsq_S0,
#ifdef CALC_COULOMB
+ fscal_S0 = gmx_mul_pr(rinvsq_S0,
gmx_add_pr(frcoul_S0,
+ gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
#else
+ fscal_S0 = gmx_mul_pr(rinvsq_S0,
(
-#endif
gmx_sub_pr(FrLJ12_S0, FrLJ6_S0)));
- fscal_S1 = gmx_mul_pr(rinvsq_S1,
+#endif
#ifdef CALC_COULOMB
+ fscal_S1 = gmx_mul_pr(rinvsq_S1,
gmx_add_pr(frcoul_S1,
+ gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
#else
+ fscal_S1 = gmx_mul_pr(rinvsq_S1,
(
-#endif
gmx_sub_pr(FrLJ12_S1, FrLJ6_S1)));
+#endif
#else
fscal_S0 = gmx_mul_pr(rinvsq_S0, frcoul_S0);
fscal_S1 = gmx_mul_pr(rinvsq_S1, frcoul_S1);
#endif /* CALC_LJ */
#if defined CALC_LJ && !defined HALF_LJ
- fscal_S2 = gmx_mul_pr(rinvsq_S2,
#ifdef CALC_COULOMB
+ fscal_S2 = gmx_mul_pr(rinvsq_S2,
gmx_add_pr(frcoul_S2,
+ gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
#else
+ fscal_S2 = gmx_mul_pr(rinvsq_S2,
(
-#endif
gmx_sub_pr(FrLJ12_S2, FrLJ6_S2)));
- fscal_S3 = gmx_mul_pr(rinvsq_S3,
+#endif
#ifdef CALC_COULOMB
+ fscal_S3 = gmx_mul_pr(rinvsq_S3,
gmx_add_pr(frcoul_S3,
+ gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
#else
+ fscal_S3 = gmx_mul_pr(rinvsq_S3,
(
-#endif
gmx_sub_pr(FrLJ12_S3, FrLJ6_S3)));
+#endif
#else
/* Atom 2 and 3 don't have LJ, so only add Coulomb forces */
fscal_S2 = gmx_mul_pr(rinvsq_S2, frcoul_S2);
#if UNROLLJ >= 4
/* We use an i-force SIMD register width of 4 */
#if UNROLLJ == 4
-#define gmx_mm_pr4 gmx_mm_pr
-#define gmx_load_pr4 gmx_load_pr
-#define gmx_store_pr4 gmx_store_pr
-#define gmx_add_pr4 gmx_add_pr
+#define gmx_mm_pr4 gmx_mm_pr
+#define gmx_load_pr4 gmx_load_pr
+#define gmx_store_pr4 gmx_store_pr
+#define gmx_add_pr4 gmx_add_pr
#else
/* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */
#endif
unsigned *exclusion_filter;
gmx_exclfilter filter_S0, filter_S1, filter_S2, filter_S3;
- gmx_mm_pr zero_S = gmx_set1_pr(0);
+ gmx_mm_pr zero_S = gmx_set1_pr(0.0);
- gmx_mm_pr one_S = gmx_set1_pr(1.0);
- gmx_mm_pr iq_S0 = gmx_setzero_pr();
- gmx_mm_pr iq_S1 = gmx_setzero_pr();
- gmx_mm_pr iq_S2 = gmx_setzero_pr();
- gmx_mm_pr iq_S3 = gmx_setzero_pr();
- gmx_mm_pr mrc_3_S;
+ gmx_mm_pr one_S = gmx_set1_pr(1.0);
+ gmx_mm_pr iq_S0 = gmx_setzero_pr();
+ gmx_mm_pr iq_S1 = gmx_setzero_pr();
+ gmx_mm_pr iq_S2 = gmx_setzero_pr();
+ gmx_mm_pr iq_S3 = gmx_setzero_pr();
+ gmx_mm_pr mrc_3_S;
#ifdef CALC_ENERGIES
- gmx_mm_pr hrc_3_S, moh_rc_S;
+ gmx_mm_pr hrc_3_S, moh_rc_S;
#endif
#ifdef CALC_COUL_TAB
* so we don't need to treat special cases in the rest of the code.
*/
#ifdef NBNXN_SEARCH_BB_SIMD4
- gmx_simd4_store_pr(&bbj[1].lower[0], gmx_simd4_load_pr(&bbj[0].lower[0]));
- gmx_simd4_store_pr(&bbj[1].upper[0], gmx_simd4_load_pr(&bbj[0].upper[0]));
+ gmx_simd4_store_pr(&bbj[1].lower[0], gmx_simd4_load_bb_pr(&bbj[0].lower[0]));
+ gmx_simd4_store_pr(&bbj[1].upper[0], gmx_simd4_load_bb_pr(&bbj[0].upper[0]));
#else
bbj[1] = bbj[0];
#endif
#ifdef NBNXN_SEARCH_BB_SIMD4
gmx_simd4_store_pr(&bb->lower[0],
- gmx_simd4_min_pr(gmx_simd4_load_pr(&bbj[0].lower[0]),
- gmx_simd4_load_pr(&bbj[1].lower[0])));
+ gmx_simd4_min_pr(gmx_simd4_load_bb_pr(&bbj[0].lower[0]),
+ gmx_simd4_load_bb_pr(&bbj[1].lower[0])));
gmx_simd4_store_pr(&bb->upper[0],
- gmx_simd4_max_pr(gmx_simd4_load_pr(&bbj[0].upper[0]),
- gmx_simd4_load_pr(&bbj[1].upper[0])));
+ gmx_simd4_max_pr(gmx_simd4_load_bb_pr(&bbj[0].upper[0]),
+ gmx_simd4_load_bb_pr(&bbj[1].upper[0])));
#else
{
int i;
int i;
- bb_0_S = gmx_simd4_load_pr(x);
+ bb_0_S = gmx_simd4_load_bb_pr(x);
bb_1_S = bb_0_S;
for (i = 1; i < na; i++)
{
- x_S = gmx_simd4_load_pr(x+i*NNBSBB_C);
+ x_S = gmx_simd4_load_bb_pr(x+i*NNBSBB_C);
bb_0_S = gmx_simd4_min_pr(bb_0_S, x_S);
bb_1_S = gmx_simd4_max_pr(bb_1_S, x_S);
}
#ifdef NBNXN_SEARCH_BB_SIMD4
gmx_simd4_pr min_S, max_S;
- min_S = gmx_simd4_min_pr(gmx_simd4_load_pr(&bb[c2*2+0].lower[0]),
- gmx_simd4_load_pr(&bb[c2*2+1].lower[0]));
- max_S = gmx_simd4_max_pr(gmx_simd4_load_pr(&bb[c2*2+0].upper[0]),
- gmx_simd4_load_pr(&bb[c2*2+1].upper[0]));
+ min_S = gmx_simd4_min_pr(gmx_simd4_load_bb_pr(&bb[c2*2+0].lower[0]),
+ gmx_simd4_load_bb_pr(&bb[c2*2+1].lower[0]));
+ max_S = gmx_simd4_max_pr(gmx_simd4_load_bb_pr(&bb[c2*2+0].upper[0]),
+ gmx_simd4_load_bb_pr(&bb[c2*2+1].upper[0]));
gmx_simd4_store_pr(&grid->bbj[c2].lower[0], min_S);
gmx_simd4_store_pr(&grid->bbj[c2].upper[0], max_S);
#else
gmx_simd4_pr dm_S;
gmx_simd4_pr dm0_S;
- bb_i_S0 = gmx_simd4_load_pr(&bb_i_ci[si].lower[0]);
- bb_i_S1 = gmx_simd4_load_pr(&bb_i_ci[si].upper[0]);
- bb_j_S0 = gmx_simd4_load_pr(&bb_j_all[csj].lower[0]);
- bb_j_S1 = gmx_simd4_load_pr(&bb_j_all[csj].upper[0]);
+ bb_i_S0 = gmx_simd4_load_bb_pr(&bb_i_ci[si].lower[0]);
+ bb_i_S1 = gmx_simd4_load_bb_pr(&bb_i_ci[si].upper[0]);
+ bb_j_S0 = gmx_simd4_load_bb_pr(&bb_j_all[csj].lower[0]);
+ bb_j_S1 = gmx_simd4_load_bb_pr(&bb_j_all[csj].upper[0]);
dl_S = gmx_simd4_sub_pr(bb_i_S0, bb_j_S1);
dh_S = gmx_simd4_sub_pr(bb_j_S0, bb_i_S1);
\
shi = si*NNBSBB_D*DIM; \
\
- xi_l = gmx_simd4_load_pr(bb_i+shi+0*STRIDE_PBB); \
- yi_l = gmx_simd4_load_pr(bb_i+shi+1*STRIDE_PBB); \
- zi_l = gmx_simd4_load_pr(bb_i+shi+2*STRIDE_PBB); \
- xi_h = gmx_simd4_load_pr(bb_i+shi+3*STRIDE_PBB); \
- yi_h = gmx_simd4_load_pr(bb_i+shi+4*STRIDE_PBB); \
- zi_h = gmx_simd4_load_pr(bb_i+shi+5*STRIDE_PBB); \
+ xi_l = gmx_simd4_load_bb_pr(bb_i+shi+0*STRIDE_PBB); \
+ yi_l = gmx_simd4_load_bb_pr(bb_i+shi+1*STRIDE_PBB); \
+ zi_l = gmx_simd4_load_bb_pr(bb_i+shi+2*STRIDE_PBB); \
+ xi_h = gmx_simd4_load_bb_pr(bb_i+shi+3*STRIDE_PBB); \
+ yi_h = gmx_simd4_load_bb_pr(bb_i+shi+4*STRIDE_PBB); \
+ zi_h = gmx_simd4_load_bb_pr(bb_i+shi+5*STRIDE_PBB); \
\
dx_0 = gmx_simd4_sub_pr(xi_l, xj_h); \
dy_0 = gmx_simd4_sub_pr(yi_l, yj_h); \
rc2_S = gmx_simd4_set1_pr(rl2);
dim_stride = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB*DIM;
- ix_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+0)*STRIDE_PBB);
- iy_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+1)*STRIDE_PBB);
- iz_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+2)*STRIDE_PBB);
- ix_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+3)*STRIDE_PBB);
- iy_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+4)*STRIDE_PBB);
- iz_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+5)*STRIDE_PBB);
+ ix_S0 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+0)*STRIDE_PBB);
+ iy_S0 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+1)*STRIDE_PBB);
+ iz_S0 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+2)*STRIDE_PBB);
+ ix_S1 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+3)*STRIDE_PBB);
+ iy_S1 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+4)*STRIDE_PBB);
+ iz_S1 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+5)*STRIDE_PBB);
/* We loop from the outer to the inner particles to maximize
* the chance that we find a pair in range quickly and return.
inner_e = ge - (se << na_cj_2log);
nbl->cj[found].excl &= ~(1U<<((inner_i<<na_cj_2log) + inner_e));
+/* The next line is needed only on BlueGene/Q. We do not instead make
+ * the line above conditional, because logic elsewhere relies on being
+ * able to detect easily whether any exclusions exist. */
+#if (defined GMX_CPU_ACCELERATION_IBM_QPX)
+ nbl->cj[found].interaction_mask_indices[inner_i] &= ~(1U << inner_e);
+#endif
}
}
}
extern "C" {
#endif
-
/* Returns the j-cluster size for kernel of type nb_kernel_type */
int nbnxn_kernel_to_cj_size(int nb_kernel_type);
/* Store cj and the interaction mask */
nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj;
nbl->cj[nbl->ncj].excl = get_imask_simd_4xn(remove_sub_diag, ci, cj);
+#ifdef GMX_CPU_ACCELERATION_IBM_QPX
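+ /* Each 4-bit nibble of excl holds the four j-lane bits for one
+ * i-atom, so it can be used directly as an index into the table
+ * of pre-computed boolean interaction masks; e.g. excl == 0xF731
+ * gives indices 1, 3, 7 and 15 for i-atoms 0-3. */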
+ nbl->cj[nbl->ncj].interaction_mask_indices[0] = (nbl->cj[nbl->ncj].excl & 0x000F) >> (0 * 4);
+ nbl->cj[nbl->ncj].interaction_mask_indices[1] = (nbl->cj[nbl->ncj].excl & 0x00F0) >> (1 * 4);
+ nbl->cj[nbl->ncj].interaction_mask_indices[2] = (nbl->cj[nbl->ncj].excl & 0x0F00) >> (2 * 4);
+ nbl->cj[nbl->ncj].interaction_mask_indices[3] = (nbl->cj[nbl->ncj].excl & 0xF000) >> (3 * 4);
+#endif
nbl->ncj++;
}
/* Increase the closing index in i super-cell list */
}
#undef STRIDE_S
-