From 25eb0e14db996febfe78195a2e63ee2874ae84f7 Mon Sep 17 00:00:00 2001 From: Mark Abraham Date: Mon, 19 Aug 2013 18:33:55 +0200 Subject: [PATCH] BlueGene/Q Verlet cut-off scheme kernels The kernels are implemented with small functions whose inlining is guaranteed by the use of xlc and clang extensions. That's a hack whose general solution I plan to implement in master branch. Other BG/Q considerations: Architecture detection now works on A2 core. Install guide updated. It is better to use intra-node communicators than not, and ranks within nodes are correctly detected via querying the BlueGene/Q API, since the hostname is not useful for the purpose. It is better to not set GMX_DD_SENDRECV2. It is better to use the analytical Ewald correction. In principle, we should version the type of variables and fields named d2, rl2, rbb2 in nbnxn_search*[ch] to be double on PowerPC and float everywhere else (each regardless of GROMACS target precision). This would mean that on PowerPC (where all flops take place in double precision with free precision-extension upon load) we can be both cache-efficient by storing bounding boxes in float, and flop-efficient by not having to generate a round-to-single instruction to compare the result of subc_bb_dist2_simd4 with the cut-off stored as a float. Still, a flop per bounding-box distance comparison will not break the bank. Enough bgclang support exists for the build to succeed (no platform file is required), even with OpenMP, but a number of compiler issues have been reported on llvm-bgq-discuss mailing list. Change-Id: I98c5791ec3766cdbdcb8a8eb7418d00585727cc0 --- CMakeLists.txt | 65 ++-- admin/installguide/installguide.tex | 54 +-- cmake/Platform/BlueGeneQ-base.cmake | 10 + cmake/TestBlueGeneQ.c | 8 + cmake/TestX86.c | 8 + cmake/gmxDetectAcceleration.cmake | 35 +- cmake/gmxDetectTargetArchitecture.cmake | 12 + cmake/gmxGetCompilerInfo.cmake | 16 +- cmake/gmxManageBlueGene.cmake | 4 +- cmake/gmxSetBuildInformation.cmake | 14 +- cmake/gmxTestInlineASM.cmake | 2 - include/gmx_simd4_macros.h | 164 ++++++++- include/gmx_simd_macros.h | 334 ++++++++++++++++-- include/gmx_simd_ref.h | 10 +- include/network.h | 7 +- include/types/nb_verlet.h | 2 +- include/types/nbnxn_pairlist.h | 26 +- src/config.h.cmakein | 6 + src/gmxlib/gmx_cpuid.c | 2 +- src/gmxlib/network.c | 48 ++- src/kernel/runner.c | 9 + src/mdlib/forcerec.c | 15 +- src/mdlib/nbnxn_atomdata.c | 135 ++++--- src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 2 +- .../nbnxn_kernels/nbnxn_kernel_simd_utils.h | 9 + .../nbnxn_kernel_simd_utils_ibm_qpx.h | 250 +++++++++++++ .../nbnxn_kernel_simd_utils_ref.h | 18 +- .../nbnxn_kernel_simd_utils_x86_128d.h | 2 +- .../nbnxn_kernel_simd_utils_x86_128s.h | 6 +- .../nbnxn_kernel_simd_utils_x86_256d.h | 6 +- .../nbnxn_kernel_simd_utils_x86_256s.h | 2 +- .../simd_2xnn/nbnxn_kernel_simd_2xnn_common.h | 3 + .../simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h | 16 +- .../simd_4xn/nbnxn_kernel_simd_4xn_common.h | 15 + .../simd_4xn/nbnxn_kernel_simd_4xn_inner.h | 45 ++- .../simd_4xn/nbnxn_kernel_simd_4xn_outer.h | 24 +- src/mdlib/nbnxn_search.c | 62 ++-- src/mdlib/nbnxn_search.h | 1 - src/mdlib/nbnxn_search_simd_4xn.h | 7 +- 39 files changed, 1210 insertions(+), 244 deletions(-) create mode 100644 cmake/TestBlueGeneQ.c create mode 100644 cmake/TestX86.c create mode 100644 cmake/gmxDetectTargetArchitecture.cmake create mode 100644 src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 82ff762971..febf211677 100644 --- a/CMakeLists.txt 
+++ b/CMakeLists.txt @@ -169,19 +169,19 @@ mark_as_advanced(GMX_SOFTWARE_INVSQRT) option(GMX_FAHCORE "Build a library with mdrun functionality" OFF) mark_as_advanced(GMX_FAHCORE) -include(gmxDetectAcceleration) -if(NOT DEFINED GMX_CPU_ACCELERATION) - if(CMAKE_CROSSCOMPILING) - if("${CMAKE_SYSTEM_NAME}" MATCHES "BlueGeneQ") - set(GMX_SUGGESTED_CPU_ACCELERATION "IBM_QPX") - else() - set(GMX_SUGGESTED_CPU_ACCELERATION "None") - endif() - else(CMAKE_CROSSCOMPILING) - gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION) - endif(CMAKE_CROSSCOMPILING) -endif(NOT DEFINED GMX_CPU_ACCELERATION) +if(NOT DEFINED GMX_CPU_ACCELERATION AND NOT CMAKE_CROSSCOMPILING) + include(gmxDetectAcceleration) + gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION) +endif() +# Detect the architecture the compiler is targetting, detect +# acceleration possibilities on that hardware, suggest an acceleration +# to use if none is specified, and populate the cache option for CPU +# accleration. +include(gmxDetectTargetArchitecture) +gmx_detect_target_architecture() +include(gmxDetectAcceleration) +gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION) set(GMX_CPU_ACCELERATION "@GMX_SUGGESTED_CPU_ACCELERATION@" CACHE STRING "Accelerated CPU kernels. Pick one of: None, SSE2, SSE4.1, AVX_128_FMA, AVX_256, IBM_QPX, Sparc64_HPC_ACE") @@ -689,7 +689,7 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE2") # The user should not be able to set this orthogonally to the acceleration set(GMX_X86_SSE2 1) if (NOT ACCELERATION_QUIETLY) - message(STATUS "Enabling SSE2 Gromacs acceleration, and it will help compiler optimization.") + message(STATUS "Enabling SSE2 Gromacs acceleration") endif() elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1") @@ -736,7 +736,7 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "SSE4.1") set(GMX_X86_SSE4_1 1) set(GMX_X86_SSE2 1) if (NOT ACCELERATION_QUIETLY) - message(STATUS "Enabling SSE4.1 Gromacs acceleration, and it will help compiler optimization.") + message(STATUS "Enabling SSE4.1 Gromacs acceleration") endif() if(CMAKE_C_COMPILER_ID MATCHES "Intel" AND C_COMPILER_VERSION VERSION_EQUAL "11.1") @@ -811,7 +811,7 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION} set(GMX_CPU_ACCELERATION_X86_AVX_128_FMA 1) set(GMX_X86_AVX_128_FMA 1) if (NOT ACCELERATION_QUIETLY) - message(STATUS "Enabling 128-bit AVX Gromacs acceleration (with fused-multiply add), and it will help compiler optimization.") + message(STATUS "Enabling 128-bit AVX Gromacs acceleration (with fused-multiply add)") endif() # We don't have the full compiler version string yet (BUILD_C_COMPILER), @@ -832,7 +832,7 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION} set(GMX_CPU_ACCELERATION_X86_AVX_256 1) set(GMX_X86_AVX_256 1) if (NOT ACCELERATION_QUIETLY) - message(STATUS "Enabling 256-bit AVX Gromacs acceleration, and it will help compiler optimization.") + message(STATUS "Enabling 256-bit AVX Gromacs acceleration") endif() endif() @@ -841,31 +841,14 @@ elseif(${GMX_CPU_ACCELERATION} STREQUAL "AVX_128_FMA" OR ${GMX_CPU_ACCELERATION} gmx_test_avx_gcc_maskload_bug(${ACCELERATION_C_FLAGS} GMX_X86_AVX_GCC_MASKLOAD_BUG) elseif(${GMX_CPU_ACCELERATION} STREQUAL "IBM_QPX") - # Used on BlueGene/Q - if (CMAKE_C_COMPILER_ID MATCHES "XL") - GMX_TEST_CFLAG(XLC_BLUEGENEQ_CFLAG "-qarch=qp -qtune=qp" ACCELERATION_C_FLAGS) - try_compile(TEST_QPX ${CMAKE_BINARY_DIR} - "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c" - COMPILE_DEFINITIONS "${ACCELERATION_C_FLAGS}") - if(NOT TEST_QPX) - message(FATAL_ERROR "Cannot 
compile the requested IBM QPX intrinsics.") - endif() - endif() - if (CMAKE_CXX_COMPILER_ID MATCHES "XL" AND CMAKE_CXX_COMPILER_LOADED) - GMX_TEST_CXXFLAG(XLC_BLUEGENEQ_CXXFLAG "-qarch=qp -qtune=qp" ACCELERATION_CXX_FLAGS) - try_compile(TEST_QPX ${CMAKE_BINARY_DIR} - "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c" - COMPILE_DEFINITIONS "${ACCELERATION_CXX_FLAGS}") - if(NOT TEST_QPX) - message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics.") - endif() - endif() + try_compile(TEST_QPX ${CMAKE_BINARY_DIR} + "${CMAKE_SOURCE_DIR}/cmake/TestQPX.c") if (TEST_QPX) - message(WARNING "IBM QPX acceleration was selected and could be compiled, but the accelerated kernels are not yet available.") + message(WARNING "IBM QPX acceleration was selected. This will work, but SIMD-accelerated kernels are only available for the Verlet cut-off scheme. The plain C kernels that are used for the group cut-off scheme kernels will be slow, so please consider using the Verlet cut-off scheme.") set(GMX_CPU_ACCELERATION_IBM_QPX 1) else() - message(FATAL_ERROR "Cannot compile IBM QPX intrinsics without the XL compiler. If you are compiling for BlueGene/Q, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=BlueGeneQ-static-XL-C' to set up the tool chain.") + message(FATAL_ERROR "Cannot compile the requested IBM QPX intrinsics. If you are compiling for BlueGene/Q with the XL compilers, use 'cmake .. -DCMAKE_TOOLCHAIN_FILE=Platform/BlueGeneQ-static-XL-C' to set up the tool chain.") endif() elseif(${GMX_CPU_ACCELERATION} STREQUAL "SPARC64_HPC_ACE") set(GMX_CPU_ACCELERATION_SPARC64_HPC_ACE 1) @@ -1063,6 +1046,14 @@ mark_as_advanced(GMX_BUILD_MANPAGES) if(HAVE_LIBM) list(APPEND GMX_EXTRA_LIBRARIES m) endif(HAVE_LIBM) +if (${CMAKE_SYSTEM_NAME} MATCHES "BlueGene") + check_library_exists(mass_simd atan2f4 "" HAVE_MASS_SIMD) + if(HAVE_MASS_SIMD) + list(APPEND GMX_EXTRA_LIBRARIES mass_simd) + else() + message(FATAL_ERROR "Could not link to the SIMD version of the IBM MASS library. Please adjust your CMAKE_PREFIX_PATH to contain it") + endif() +endif() if(GMX_FAHCORE) set(COREWRAP_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/../corewrap" CACHE STRING diff --git a/admin/installguide/installguide.tex b/admin/installguide/installguide.tex index 770a8668d8..cfdac1e94b 100644 --- a/admin/installguide/installguide.tex +++ b/admin/installguide/installguide.tex @@ -418,7 +418,7 @@ be specified using the following environment variables: \end{itemize} The respective '\verb+include+', '\verb+lib+', or '\verb+bin+' is appended to the path. For each of these variables, a list of paths can -be specified (on Unix seperated with ":"). Note that these are +be specified (on Unix separated with ":"). Note that these are enviroment variables (and not \cmake{} command-line arguments) and in a '\verb+bash+' shell are used like: \begin{verbatim} @@ -447,8 +447,8 @@ is found, and otherwise fall back on a version of \blas{} internal to accordingly. The internal versions are fine for normal use. If you need to specify a non-standard path to search, use \verb+-DCMAKE_PREFIX_PATH=/path/to/search+. If you need to specify a -library with a non-standard name (e.g. ESSL on AIX), then set -\verb+-DGMX_BLAS_USER=/path/to/reach/lib/libwhatever.a+. +library with a non-standard name (e.g. ESSL on AIX or BlueGene), then +set \verb+-DGMX_BLAS_USER=/path/to/reach/lib/libwhatever.a+. If you are using Intel's \mkl{} for \fft{}, then the \blas{} and \lapack{} it provides are used automatically. This could be @@ -693,14 +693,14 @@ parallel job execution. 
\subsubsection{BlueGene/P} -There is currently no native acceleration on this platform, but the -default plain C kernels will work. +There is currently no native acceleration on this platform and no +plans to make one. The default plain C kernels will work. \subsubsection{BlueGene/Q} -There is currently no native acceleration on this platform, but the -default plain C kernels will work. We have accelerated kernels in -progress for this platform, but they are not quite done yet. +There is currently native acceleration on this platform for the Verlet +cut-off scheme. Accelerated kernels for the group cut-off scheme may +come in the future, but the default plain C kernels will work. Only static linking with XL compilers is supported by \gromacs{}. Dynamic linking would be supported by the architecture and \gromacs{}, but has no @@ -722,23 +722,35 @@ You need to arrange for FFTW to be installed correctly, following the above instructions. mpicc is used for compiling and linking. This can make it awkward to -attempt to use IBM's optimized BLAS/LAPACK called ESSL. Since mdrun is -the only part of \gromacs{} that should normally run on the compute -nodes, and there is nearly no need for linear algebra support for -mdrun, it is recommended to use the \gromacs{} built-in linear algebra -routines - it is rare for this to be a bottleneck. - +attempt to use IBM's optimized BLAS/LAPACK called ESSL (see the +section on linear algebra). Since mdrun is the only part of \gromacs{} +that should normally run on the compute nodes, and there is nearly no +need for linear algebra support for mdrun, it is recommended to use +the \gromacs{} built-in linear algebra routines - it is rare for this +to be a bottleneck. + +The recommended configuration is to use \begin{verbatim} -cmake .. -DCMAKE_TOOLCHAIN_FILE=BlueGeneQ-static-XL-C \ - -DCMAKE_PREFIX_PATH=/your/fftw/installation/prefix +cmake .. -DCMAKE_TOOLCHAIN_FILE=Platform/BlueGeneQ-static-XL-CXX \ + -DCMAKE_PREFIX_PATH=/your/fftw/installation/prefix \ + -DGMX_MPI=on make mdrun make install-mdrun \end{verbatim} +which will build a statically-linked MPI-enabled mdrun for the back +end. Otherwise, GROMACS default configuration behaviour applies. + It is possible to configure and make the remaining \gromacs{} tools -with the compute node toolchain, but as none of those tools are -\mpi{}-aware, this would not normally be useful. Instead, these should -be planned to run on the login node, and a seperate \gromacs{} -installation performed for that using the login node's toolchain. +with the compute-node toolchain, but as none of those tools are +\mpi{}-aware and could then only run on the compute nodes, this +would not normally be useful. Instead, these should be planned +to run on the login node, and a separate \gromacs{} installation +performed for that using the login node's toolchain - not the +above platform file, or any other compute-node toolchain. + +Note that only the MPI build is available for the compute-node +toolchains. The GROMACS thread-MPI or serial builds are not useful at +all on BlueGene/Q. \subsubsection{Fujitsu PRIMEHPC} @@ -757,7 +769,7 @@ repository is currently tested on x86 with gcc versions ranging from 4.4 through 4.7, and versions 12 and 13 of the Intel compiler. Under Windows we test both the visual studio compilers and icc, -We test irregularly on BlueGene/L, BlueGene/P, BlueGene/Q, Cray, +We test irregularly on BlueGene/Q, Cray, Fujitsu PRIMEHPC, Google nativeclient and other environments. 
In the future we expect ARM to be an important test target too, but this is currently not included. diff --git a/cmake/Platform/BlueGeneQ-base.cmake b/cmake/Platform/BlueGeneQ-base.cmake index dd17ab682b..94a4b01373 100644 --- a/cmake/Platform/BlueGeneQ-base.cmake +++ b/cmake/Platform/BlueGeneQ-base.cmake @@ -117,4 +117,14 @@ macro(__BlueGeneQ_set_static_flags compiler_id lang) " -o ") set(CMAKE_${lang}_LINK_EXECUTABLE " ${BG/Q_${lang}_DEFAULT_EXE_FLAGS}") + + if(CMAKE_BUILD_TYPE STREQUAL "Debug" AND ${compiler_id} STREQUAL "XL") + # Work around an unknown compiler bug triggered in + # compute_globals(). Using -O0 disables -qhot and this seems + # to break the normal OpenMP flag -qsmp unless qualified with + # noauto. + set(OpenMP_C_FLAGS "-qsmp=noauto" CACHE STRING "Compiler flag for OpenMP parallelization") + set(OpenMP_CXX_FLAGS "-qsmp=noauto" CACHE STRING "Compiler flag for OpenMP parallelization") + endif() + endmacro() diff --git a/cmake/TestBlueGeneQ.c b/cmake/TestBlueGeneQ.c new file mode 100644 index 0000000000..7edc604aeb --- /dev/null +++ b/cmake/TestBlueGeneQ.c @@ -0,0 +1,8 @@ +int main() +{ +#ifdef __bgq__ + return 0; +#else +#error This compiler is not targetting BlueGene/Q +#endif +} diff --git a/cmake/TestX86.c b/cmake/TestX86.c new file mode 100644 index 0000000000..0cceef0005 --- /dev/null +++ b/cmake/TestX86.c @@ -0,0 +1,8 @@ +int main() +{ +#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64) + return 0; +#else +#error This is not x86 +#endif +} diff --git a/cmake/gmxDetectAcceleration.cmake b/cmake/gmxDetectAcceleration.cmake index b51137def7..6b256d69fe 100644 --- a/cmake/gmxDetectAcceleration.cmake +++ b/cmake/gmxDetectAcceleration.cmake @@ -34,19 +34,22 @@ # # - Check the username performing the build, as well as date and time # -# GMX_DETECT_ACCELERATION(GMX_SUGGESTED_ACCELERATION) +# gmx_detect_acceleration(GMX_SUGGESTED_CPU_ACCELERATION) # # Try to detect CPU information and suggest an acceleration option -# (such as SSE/AVX) that fits the current CPU. +# (such as SSE/AVX) that fits the current CPU. These functions assume +# that gmx_detect_target_architecture() has already been run, so that +# things like GMX_IS_X86 are already available. # -# GMX_SUGGESTED_ACCELERATION +# Sets ${GMX_SUGGESTED_CPU_ACCELERATION} in the parent scope if +# GMX_CPU_ACCELERATION is not set (e.g. by the user, or a previous run +# of CMake). # # we rely on inline asm support for GNU! 
include(gmxTestInlineASM) -macro(gmx_detect_acceleration GMX_SUGGESTED_ACCELERATION) - IF(NOT DEFINED ${GMX_SUGGESTED_ACCELERATION}) +function(gmx_suggest_x86_acceleration _suggested_acceleration) gmx_test_inline_asm_gcc_x86(GMX_X86_GCC_INLINE_ASM) @@ -62,7 +65,7 @@ macro(gmx_detect_acceleration GMX_SUGGESTED_ACCELERATION) try_run(GMX_CPUID_RUN_ACC GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" + COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE -DGMX_IS_X86" RUN_OUTPUT_VARIABLE OUTPUT_TMP COMPILE_OUTPUT_VARIABLE GMX_CPUID_COMPILE_OUTPUT ARGS "-acceleration") @@ -79,10 +82,20 @@ macro(gmx_detect_acceleration GMX_SUGGESTED_ACCELERATION) string(STRIP "@OUTPUT_TMP@" OUTPUT_ACC) - message(STATUS "Detecting best acceleration for this CPU - @OUTPUT_ACC@") + set(${_suggested_acceleration} "@OUTPUT_ACC@" PARENT_SCOPE) + message(STATUS "Detected best acceleration for this CPU - @OUTPUT_ACC@") +endfunction() - set(${GMX_SUGGESTED_ACCELERATION} "@OUTPUT_ACC@" CACHE INTERNAL "GROMACS CPU-specific acceleration") - - ENDIF(NOT DEFINED ${GMX_SUGGESTED_ACCELERATION}) -endmacro(gmx_detect_acceleration GMX_SUGGESTED_ACCELERATION) +function(gmx_detect_acceleration _suggested_acceleration) + if(NOT DEFINED GMX_CPU_ACCELERATION) + if(GMX_IS_BGQ) + set(${_suggested_acceleration} "IBM_QPX") + elseif(GMX_IS_X86) + gmx_suggest_x86_acceleration(${_suggested_acceleration}) + else() + set(${_suggested_acceleration} "None") + endif() + set(${_suggested_acceleration} ${${_suggested_acceleration}} PARENT_SCOPE) + endif() +endfunction() diff --git a/cmake/gmxDetectTargetArchitecture.cmake b/cmake/gmxDetectTargetArchitecture.cmake new file mode 100644 index 0000000000..b4691ac6e1 --- /dev/null +++ b/cmake/gmxDetectTargetArchitecture.cmake @@ -0,0 +1,12 @@ +# - Define function to detect whether the compiler's target +# - architecture is one for which GROMACS has special treatment +# - (e.g. kernel acceleration) +# +# Sets GMX_IS_X86 or GMX_IS_BGQ if targetting that architecture + +function(gmx_detect_target_architecture) + try_compile(GMX_IS_X86 ${CMAKE_BINARY_DIR} + "${CMAKE_SOURCE_DIR}/cmake/TestX86.c") + try_compile(GMX_IS_BGQ ${CMAKE_BINARY_DIR} + "${CMAKE_SOURCE_DIR}/cmake/TestBlueGeneQ.c") +endfunction() diff --git a/cmake/gmxGetCompilerInfo.cmake b/cmake/gmxGetCompilerInfo.cmake index b1fa45e950..4a1899b5aa 100644 --- a/cmake/gmxGetCompilerInfo.cmake +++ b/cmake/gmxGetCompilerInfo.cmake @@ -41,6 +41,8 @@ # - with cmake <=2.8.8: compilers that accept "-dumpversion" argument: # gcc, Intel Compiler (on Linux and Mac OS), Open64, EkoPath, clang # (and probably other gcc-compatible compilers). 
+# - with cmake <=2.8.8: xlC is not supported (it does not take -dumpversion, +# but fortunately so far GROMACS never needs to know the version number) # # C_COMPILER_VERSION - version string of the current C compiler (CMAKE_C_COMPILER) # CXX_COMPILER_VERSION - version string of the current C++ compiler (CMAKE_CXX_COMPILER) @@ -50,6 +52,8 @@ macro(get_compiler_version) set(_cc_dumpversion_res 0) if (DEFINED CMAKE_C_COMPILER_VERSION AND CMAKE_VERSION VERSION_GREATER 2.8.8) set(_cc_version ${CMAKE_C_COMPILER_VERSION}) + elseif (CMAKE_C_COMPILER_ID MATCHES "XL") + set(_cc_dumpversion_res 1) else() execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion RESULT_VARIABLE _cc_dumpversion_res @@ -70,6 +74,8 @@ macro(get_compiler_version) set(_cxx_dumpversion_res 0) if (DEFINED CMAKE_CXX_COMPILER_VERSION AND CMAKE_VERSION VERSION_GREATER 2.8.8) set(_cxx_version ${CMAKE_CXX_COMPILER_VERSION}) + elseif (CMAKE_CXX_COMPILER_ID MATCHES "XL") + set(_cxx_dumpversion_res 1) else() execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion RESULT_VARIABLE _cxx_dumpversion_res @@ -103,12 +109,18 @@ endmacro() # BUILD_FLAGS - [output variable] flags for the compiler # macro(get_compiler_info LANGUAGE BUILD_COMPILER BUILD_FLAGS) - execute_process(COMMAND ${CMAKE_${LANGUAGE}_COMPILER} --version + if (CMAKE_C_COMPILER_ID MATCHES "XL") + set(_flag_to_query_version "-qversion") + else() + set(_flag_to_query_version "--version") + endif() + execute_process(COMMAND ${CMAKE_${LANGUAGE}_COMPILER} ${_flag_to_query_version} RESULT_VARIABLE _exec_result OUTPUT_VARIABLE _compiler_version ERROR_VARIABLE _compiler_version) - # Try executing just the compiler command --version failed + if(_exec_result) + # Try executing just the compiler command, since --version failed execute_process(COMMAND ${CMAKE_${LANGUAGE}_COMPILER} RESULT_VARIABLE _exec_result OUTPUT_VARIABLE _compiler_version diff --git a/cmake/gmxManageBlueGene.cmake b/cmake/gmxManageBlueGene.cmake index 490d7748dc..8bae47adc7 100644 --- a/cmake/gmxManageBlueGene.cmake +++ b/cmake/gmxManageBlueGene.cmake @@ -59,8 +59,8 @@ set(GMX_GPU OFF CACHE BOOL "Cannot do GPU acceleration on BlueGene" FORCE) # facility to run lots of jobs on small chunks of the machine. You # certainly need proper MPI to use a whole chunk of the machine that # the scheduler will allocate. -set(GMX_THREAD_MPI OFF CACHE BOOL "Thread-MPI generally not compatible with BlueGene, defaulting to disabled!") -set(GMX_MPI ON CACHE BOOL "MPI is normally required on BlueGene" FORCE) +set(GMX_THREAD_MPI OFF CACHE BOOL "GROMACS bundled thread-MPI is not supported on BlueGene" FORCE) +set(GMX_MPI ON CACHE BOOL "MPI is required on BlueGene" FORCE) # Access to /etc/passwd is not available on the back end of BlueGeneP # (at least), despite being detected by CMake. 
This can cause linker diff --git a/cmake/gmxSetBuildInformation.cmake b/cmake/gmxSetBuildInformation.cmake index aaed53d3b9..d53ba27d32 100644 --- a/cmake/gmxSetBuildInformation.cmake +++ b/cmake/gmxSetBuildInformation.cmake @@ -84,36 +84,38 @@ macro(gmx_set_build_information) if(NOT CMAKE_CROSSCOMPILING) # Get CPU acceleration information + set(_compile_definitions "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE -DGMX_IS_X86") try_run(GMX_CPUID_RUN_VENDOR GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" + COMPILE_DEFINITIONS ${_compile_definitions} RUN_OUTPUT_VARIABLE OUTPUT_CPU_VENDOR ARGS "-vendor") try_run(GMX_CPUID_RUN_BRAND GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" + COMPILE_DEFINITIONS ${_compile_definitions} RUN_OUTPUT_VARIABLE OUTPUT_CPU_BRAND ARGS "-brand") try_run(GMX_CPUID_RUN_FAMILY GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" + COMPILE_DEFINITIONS ${_compile_definitions} RUN_OUTPUT_VARIABLE OUTPUT_CPU_FAMILY ARGS "-family") try_run(GMX_CPUID_RUN_MODEL GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" + COMPILE_DEFINITIONS ${_compile_definitions} RUN_OUTPUT_VARIABLE OUTPUT_CPU_MODEL ARGS "-model") try_run(GMX_CPUID_RUN_STEPPING GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" + COMPILE_DEFINITIONS ${_compile_definitions} RUN_OUTPUT_VARIABLE OUTPUT_CPU_STEPPING ARGS "-stepping") try_run(GMX_CPUID_RUN_FEATURES GMX_CPUID_COMPILED ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/src/gmxlib/gmx_cpuid.c - COMPILE_DEFINITIONS "@GCC_INLINE_ASM_DEFINE@ -I${CMAKE_SOURCE_DIR}/include -DGMX_CPUID_STANDALONE" + COMPILE_DEFINITIONS ${_compile_definitions} RUN_OUTPUT_VARIABLE OUTPUT_CPU_FEATURES ARGS "-features") + unset(_compile_definitions) string(STRIP "@OUTPUT_CPU_VENDOR@" OUTPUT_CPU_VENDOR) string(STRIP "@OUTPUT_CPU_BRAND@" OUTPUT_CPU_BRAND) diff --git a/cmake/gmxTestInlineASM.cmake b/cmake/gmxTestInlineASM.cmake index df509858a1..f6101ad4d0 100644 --- a/cmake/gmxTestInlineASM.cmake +++ b/cmake/gmxTestInlineASM.cmake @@ -37,8 +37,6 @@ # GMX_TEST_INLINE_ASM_GCC_X86(VARIABLE) # # VARIABLE will be set to true if GCC x86 inline asm works. -# -# Remember to have a cmakedefine for it too... 
MACRO(GMX_TEST_INLINE_ASM_GCC_X86 VARIABLE) IF(NOT DEFINED ${VARIABLE}) diff --git a/include/gmx_simd4_macros.h b/include/gmx_simd4_macros.h index 7ee1581807..5e5bb48768 100644 --- a/include/gmx_simd4_macros.h +++ b/include/gmx_simd4_macros.h @@ -90,6 +90,7 @@ typedef float gmx_simd4_real; #define gmx_simd4_pb gmx_simd4_ref_pb #define gmx_simd4_load_pr gmx_simd4_ref_load_pr +#define gmx_simd4_load_bb_pr gmx_simd4_ref_load_pr #define gmx_simd4_set1_pr gmx_simd4_ref_set1_pr #define gmx_simd4_setzero_pr gmx_simd4_ref_setzero_pr #define gmx_simd4_store_pr gmx_simd4_ref_store_pr @@ -184,6 +185,7 @@ typedef float gmx_simd4_real; #define gmx_simd4_pb __m128 #define gmx_simd4_load_pr _mm_load_ps +#define gmx_simd4_load_bb_pr _mm_load_ps #define gmx_simd4_set1_pr _mm_set1_ps #define gmx_simd4_setzero_pr _mm_setzero_ps #define gmx_simd4_store_pr _mm_store_ps @@ -253,6 +255,7 @@ static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b) #define gmx_simd4_pb __m256d #define gmx_simd4_load_pr _mm256_load_pd +#define gmx_simd4_load_bb_pr _mm256_load_pd #define gmx_simd4_set1_pr _mm256_set1_pd #define gmx_simd4_setzero_pr _mm256_setzero_pd #define gmx_simd4_store_pr _mm256_store_pd @@ -288,9 +291,168 @@ static inline float gmx_simd4_dotproduct3(__m128 a, __m128 b) #endif /* GMX_HAVE_SIMD4_MACROS */ - #endif /* GMX_X86_SSE2 */ +#ifdef GMX_CPU_ACCELERATION_IBM_QPX +/* i.e. BlueGene/Q */ + +/* This hack works on the compilers that can reach this code. A real + solution with broader scope will be proposed in master branch. */ +#define gmx_always_inline __attribute__((always_inline)) + +#ifdef GMX_SIMD4_SINGLE +#define GMX_HAVE_SIMD4_MACROS +#endif + +typedef vector4double gmx_simd4_pr; +typedef vector4double gmx_simd4_pb; + +/* The declarations of vec_ld* use non-const pointers, and IBM + can't/won't fix this any time soon. So GROMACS has to cast away the + const-ness of its pointers before loads. Four-wide SIMD loads + sometimes occur from variables of type real, and sometimes from + variables of type float (even at double precision), so the correct + cast cannot be done easily. The correct cast is necessary because + the resulting type determines the alignment assumption of vec_ld*, + which is different for float and double. So the loads of + always-float variables have to be done with a function that does + the correct cast. Since functions cannot be overloaded by type in + C, they have to have different names. Thus we have + gmx_simd4_load_pr and gmx_simd4_load_bb_pr. + */ + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_pr(const real *a) +{ +#ifdef NDEBUG + return vec_ld(0, (real *) a); +#else + return vec_lda(0, (real *) a); +#endif +} + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_load_bb_pr(const float *a) +{ +#ifdef NDEBUG + return vec_ld(0, (float *) a); +#else + return vec_lda(0, (float *) a); +#endif +} + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_set1_pr(const real a) +{ + return vec_splats(a); +} + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_setzero_pr() +{ + return vec_splats(0.0); +} + +/* TODO this will not yet work, because the function might be passed a + pointer to a float when running in double precision.
+ */ +static gmx_inline void gmx_always_inline gmx_simd4_store_pr(real *a, gmx_simd4_pr b) +{ +#ifdef NDEBUG + vec_st(b, 0, a); +#else + vec_sta(b, 0, a); +#endif +} + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_add_pr(gmx_simd4_pr a, gmx_simd4_pr b) +{ + return vec_add(a, b); +} + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_sub_pr(gmx_simd4_pr a, gmx_simd4_pr b) +{ + return vec_sub(a, b); +} + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_mul_pr(gmx_simd4_pr a, gmx_simd4_pr b) +{ + return vec_mul(a, b); +} + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_madd_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c) +{ + return vec_madd(a, b, c); +} + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_nmsub_pr(gmx_simd4_pr a, gmx_simd4_pr b, gmx_simd4_pr c) +{ + return vec_nmsub(a, b, c); +} + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_min_pr(gmx_simd4_pr a, gmx_simd4_pr b) +{ + /* Implemented the same way as max, but with the subtraction + operands swapped. */ + return vec_sel(b, a, vec_sub(b, a)); +} + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_max_pr(gmx_simd4_pr a, gmx_simd4_pr b) +{ + return vec_sel(b, a, vec_sub(a, b)); +} + +static gmx_inline gmx_simd4_pr gmx_always_inline gmx_simd4_blendzero_pr(gmx_simd4_pr a, gmx_simd4_pr b) +{ + return vec_sel(gmx_setzero_pr(), a, b); +} + +static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_cmplt_pr(gmx_simd4_pr a, gmx_simd4_pr b) +{ + return vec_cmplt(a, b); +} + +static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_and_pb(gmx_simd4_pb a, gmx_simd4_pb b) +{ + return vec_and(a, b); +} + +static gmx_inline gmx_simd4_pb gmx_always_inline gmx_simd4_or_pb(gmx_simd4_pb a, gmx_simd4_pb b) +{ + return vec_or(a, b); +} + +static gmx_inline float gmx_always_inline gmx_simd4_dotproduct3(gmx_simd4_pr a, gmx_simd4_pr b) +{ + /* The dot product is done solely on the QPX AXU (which is the + only available FPU). This is awkward, because pretty much no + "horizontal" SIMD-vector operations exist, unlike x86 where + SSE4.1 added various kinds of horizontal operations. So we have + to make do with shifting vector elements and operating on the + results. This makes for lots of data dependency, but the main + alternative of storing to memory and reloading is not going to + help, either. OpenMP over 2 or 4 hardware threads per core will + hide much of the latency from the data dependency. The + vec_extract() lets the compiler correctly use a floating-point + comparison on the zeroth vector element, which avoids needing + memory at all. + */ + + gmx_simd4_pr dp_shifted_left_0 = vec_mul(a, b); + gmx_simd4_pr dp_shifted_left_1 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 1); + gmx_simd4_pr dp_shifted_left_2 = vec_sldw(dp_shifted_left_0, dp_shifted_left_0, 2); + gmx_simd4_pr dp = vec_add(dp_shifted_left_2, + vec_add(dp_shifted_left_0, dp_shifted_left_1)); + + /* See comment in nbnxn_make_pairlist_part() about how this should + be able to return a double on PowerPC. */ + return (float) vec_extract(dp, 0); +} + +static gmx_inline int gmx_always_inline gmx_simd4_anytrue_pb(gmx_simd4_pb a) +{ + return gmx_anytrue_pb(a); +} + +#undef gmx_always_inline + +#endif /* GMX_CPU_ACCELERATION_IBM_QPX */ #ifdef GMX_HAVE_SIMD4_MACROS /* Generic functions to extract a SIMD4 aligned pointer from a pointer x. 
diff --git a/include/gmx_simd_macros.h b/include/gmx_simd_macros.h index 3491bd5dd6..8d2c8665ae 100644 --- a/include/gmx_simd_macros.h +++ b/include/gmx_simd_macros.h @@ -167,6 +167,7 @@ #endif #endif +#ifdef GMX_IS_X86 #ifdef GMX_X86_SSE2 /* This is for general x86 SIMD instruction sets that also support SSE2 */ @@ -177,39 +178,40 @@ #include "gmx_x86_avx_256.h" #ifdef GMX_DOUBLE #include "gmx_math_x86_avx_256_double.h" -#else +#else /* GMX_DOUBLE */ #include "gmx_math_x86_avx_256_single.h" -#endif -#else +#endif /* GMX_DOUBLE */ +#else /* GMX_X86_AVX_256 */ #ifdef GMX_X86_AVX_128_FMA #include "gmx_x86_avx_128_fma.h" #ifdef GMX_DOUBLE #include "gmx_math_x86_avx_128_fma_double.h" -#else +#else /* GMX_DOUBLE */ #include "gmx_math_x86_avx_128_fma_single.h" -#endif -#else +#endif /* GMX_DOUBLE */ +#else /* GMX_X86_AVX_128_FMA */ #ifdef GMX_X86_SSE4_1 #include "gmx_x86_sse4_1.h" #ifdef GMX_DOUBLE #include "gmx_math_x86_sse4_1_double.h" -#else +#else /* GMX_DOUBLE */ #include "gmx_math_x86_sse4_1_single.h" -#endif -#else +#endif /* GMX_DOUBLE */ +#else /* GMX_X86_SSE4_1 */ #ifdef GMX_X86_SSE2 #include "gmx_x86_sse2.h" #ifdef GMX_DOUBLE #include "gmx_math_x86_sse2_double.h" -#else +#else /* GMX_DOUBLE */ #include "gmx_math_x86_sse2_single.h" -#endif -#else +#endif /* GMX_DOUBLE */ +#else /* GMX_X86_SSE2 */ #error No x86 acceleration defined -#endif -#endif -#endif -#endif +#endif /* GMX_X86_SSE2 */ +#endif /* GMX_X86_SSE4_1 */ +#endif /* GMX_X86_AVX_128_FMA */ +#endif /* GMX_X86_AVX_256 */ + /* exp and trigonometric functions are included above */ #define GMX_SIMD_HAVE_EXP #define GMX_SIMD_HAVE_TRIGONOMETRIC @@ -271,7 +273,10 @@ static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) return _mm_or_ps(_mm_and_ps(a, sign_mask), b); }; -static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_ps(b, _mm_andnot_ps(a, c)); }; +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) +{ + return _mm_add_ps(b, _mm_andnot_ps(a, c)); +}; #define gmx_anytrue_pb _mm_movemask_ps @@ -338,7 +343,10 @@ static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) return _mm_or_pd(_mm_and_pd(a, sign_mask), b); }; -static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm_add_pd(b, _mm_andnot_pd(a, c)); }; +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) +{ + return _mm_add_pd(b, _mm_andnot_pd(a, c)); +}; #define gmx_cmplt_pr _mm_cmplt_pd @@ -404,7 +412,10 @@ static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) return _mm256_or_ps(_mm256_and_ps(a, sign_mask), b); }; -static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_ps(b, _mm256_andnot_ps(a, c)); }; +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) +{ + return _mm256_add_ps(b, _mm256_andnot_ps(a, c)); +}; /* Less-than (we use ordered, non-signaling, but that's not required) */ #define gmx_cmplt_pr(x, y) _mm256_cmp_ps(x, y, 0x11) @@ -463,7 +474,10 @@ static gmx_inline gmx_mm_pr gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) return _mm256_or_pd(_mm256_and_pd(a, sign_mask), b); }; -static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) { return _mm256_add_pd(b, _mm256_andnot_pd(a, c)); }; +static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) +{ + return _mm256_add_pd(b, _mm256_andnot_pd(a, c)); +}; /* Less-than 
(we use ordered, non-signaling, but that's not required) */ #define gmx_cmplt_pr(x, y) _mm256_cmp_pd(x, y, 0x11) @@ -490,6 +504,285 @@ static gmx_inline gmx_mm_pr gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_ #endif /* GMX_X86_SSE2 */ +#endif /* GMX_IS_X86 */ + +#ifdef GMX_CPU_ACCELERATION_IBM_QPX + +/* This hack works on the compilers that can reach this code. A real + solution with broader scope will be proposed in master branch. */ +#define gmx_always_inline __attribute__((always_inline)) + +/* This is for the A2 core on BlueGene/Q that supports IBM's QPX + vector built-in functions */ +#define GMX_HAVE_SIMD_MACROS +#ifdef __clang__ +#include +#else +#include "mass_simd.h" +#endif + +/* No need to version the code by the precision, because the QPX AXU + extends to and truncates from double precision for free. */ + +#define GMX_SIMD_WIDTH_HERE 4 +typedef vector4double gmx_mm_pr; +typedef vector4double gmx_mm_pb; +typedef vector4double gmx_epi32; +#define GMX_SIMD_EPI32_WIDTH 4 + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_load_pr(const real *a) +{ +#ifdef NDEBUG + return vec_ld(0, (real *) a); +#else + return vec_lda(0, (real *) a); +#endif +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_load1_pr(const real *a) +{ + return vec_splats(*a); +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_set1_pr(real a) +{ + return vec_splats(a); +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_setzero_pr() +{ + return vec_splats(0.0); +} + +static gmx_inline void gmx_always_inline gmx_store_pr(real *a, gmx_mm_pr b) +{ +#ifdef NDEBUG + vec_st(b, 0, a); +#else + vec_sta(b, 0, a); +#endif +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_add_pr(gmx_mm_pr a, gmx_mm_pr b) +{ + return vec_add(a, b); +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_sub_pr(gmx_mm_pr a, gmx_mm_pr b) +{ + return vec_sub(a, b); +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_mul_pr(gmx_mm_pr a, gmx_mm_pr b) +{ + return vec_mul(a, b); +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_madd_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c) +{ + return vec_madd(a, b, c); +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_nmsub_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c) +{ + return vec_nmsub(a, b, c); +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_max_pr(gmx_mm_pr a, gmx_mm_pr b) +{ + return vec_sel(b, a, vec_sub(a, b)); +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_blendzero_pr(gmx_mm_pr a, gmx_mm_pr b) +{ + return vec_sel(gmx_setzero_pr(), a, b); +} + +static gmx_inline gmx_mm_pb gmx_always_inline gmx_cmplt_pr(gmx_mm_pr a, gmx_mm_pr b) +{ + return vec_cmplt(a, b); +} + +static gmx_inline gmx_mm_pb gmx_always_inline gmx_and_pb(gmx_mm_pb a, gmx_mm_pb b) +{ + return vec_and(a, b); +} + +static gmx_inline gmx_mm_pb gmx_always_inline gmx_or_pb(gmx_mm_pb a, gmx_mm_pb b) +{ + return vec_or(a, b); +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_round_pr(gmx_mm_pr a) +{ + return vec_round(a); +} + +#define GMX_SIMD_HAVE_FLOOR +static gmx_inline gmx_mm_pr gmx_always_inline gmx_floor_pr(gmx_mm_pr a) +{ + return vec_floor(a); +} + +#define GMX_SIMD_HAVE_BLENDV +static gmx_inline gmx_mm_pr gmx_always_inline gmx_blendv_pr(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c) +{ + return vec_sel(b, a, gmx_cmplt_pr(gmx_setzero_pr(), c)); +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_cpsgn_nonneg_pr(gmx_mm_pr a, gmx_mm_pr b) +{ + return vec_cpsgn(a, b); +}; + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_masknot_add_pr(gmx_mm_pb a, gmx_mm_pr b, gmx_mm_pr c) +{ + 
return vec_add(b, vec_sel(c, gmx_setzero_pr(), a)); +}; + +static gmx_inline gmx_bool gmx_always_inline +GMX_SIMD_IS_TRUE(real x) +{ + return x >= 0.0; +} + +static gmx_inline gmx_epi32 gmx_always_inline gmx_cvttpr_epi32(gmx_mm_pr a) +{ + return vec_ctiwuz(a); +} +/* Don't want this, we have floor */ +/* #define gmx_cvtepi32_pr vec_cvtepi32 */ + +/* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA + Architecture only promises 2^-8. So probably no need for + Newton-Raphson iterates at single or double. */ +static gmx_inline gmx_mm_pr gmx_always_inline gmx_rsqrt_pr(gmx_mm_pr a) +{ + return vec_rsqrte(a); +} + +/* A2 core on BG/Q delivers relative error of 2^-14, whereas Power ISA + Architecture only promises 2^-5. So probably no need for + Newton-Raphson iterates at single or double. */ +static gmx_inline gmx_mm_pr gmx_always_inline gmx_rcp_pr(gmx_mm_pr a) +{ + return vec_re(a); +} + +/* Note that here, and below, we use the built-in SLEEF port when + compiling on BlueGene/Q with clang */ + +#define GMX_SIMD_HAVE_EXP +static gmx_inline gmx_mm_pr gmx_always_inline gmx_exp_pr(gmx_mm_pr a) +{ +#ifdef __clang__ +#ifndef GMX_DOUBLE + return xexpf(a); +#else + return xexp(a); +#endif +#else +#ifndef GMX_DOUBLE + return expf4(a); +#else + return expd4(a); +#endif +#endif +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_sqrt_pr(gmx_mm_pr a) +{ +#ifdef NDEBUG + return vec_swsqrt_nochk(a); +#else + return vec_swsqrt(a); +#endif +} + +#define GMX_SIMD_HAVE_TRIGONOMETRIC +static gmx_inline int gmx_always_inline gmx_sincos_pr(gmx_mm_pr a, gmx_mm_pr *b, gmx_mm_pr *c) +{ +#ifdef __clang__ +#ifndef GMX_DOUBLE + xsincosf(a, b, c); +#else + xsincos(a, b, c); +#endif +#else +#ifndef GMX_DOUBLE + sincosf4(a, b, c); +#else + sincosd4(a, b, c); +#endif +#endif + return 1; +} + +static gmx_inline gmx_mm_pr gmx_always_inline gmx_acos_pr(gmx_mm_pr a) +{ +#ifdef __clang__ +#ifndef GMX_DOUBLE + return xacosf(a); +#else + return xacos(a); +#endif +#else +#ifndef GMX_DOUBLE + return acosf4(a); +#else + return acosd4(a); +#endif +#endif +} + +/* NB The order of parameters here is correct; the + documentation of atan2[df]4 in SIMD MASS is wrong. */ +static gmx_inline gmx_mm_pr gmx_always_inline gmx_atan2_pr(gmx_mm_pr a, gmx_mm_pr b) +{ +#ifdef __clang__ +#ifndef GMX_DOUBLE + return xatan2f(a, b); +#else + return xatan2(a, b); +#endif +#else +#ifndef GMX_DOUBLE + return atan2f4(a, b); +#else + return atan2d4(a, b); +#endif +#endif +} + +static gmx_inline int gmx_always_inline +gmx_anytrue_pb(gmx_mm_pb a) +{ + /* The "anytrue" is done solely on the QPX AXU (which is the only + available FPU). This is awkward, because pretty much no + "horizontal" SIMD-vector operations exist, unlike x86 where + SSE4.1 added various kinds of horizontal operations. So we have + to make do with shifting vector elements and operating on the + results. This makes for lots of data dependency, but the main + alternative of storing to memory and reloading is not going to + help, either. OpenMP over 2 or 4 hardware threads per core will + hide much of the latency from the data dependency. The + vec_extract() lets the compiler correctly use a floating-point + comparison on the zeroth vector element, which avoids needing + memory at all. 
+ */ + gmx_mm_pb vec_shifted_left_0 = a; + gmx_mm_pb vec_shifted_left_1 = vec_sldw(a, a, 1); + gmx_mm_pb vec_shifted_left_2 = vec_sldw(a, a, 2); + gmx_mm_pb vec_shifted_left_3 = vec_sldw(a, a, 3); + + gmx_mm_pb vec_return = vec_or(vec_or(vec_shifted_left_2, vec_shifted_left_3), + vec_or(vec_shifted_left_0, vec_shifted_left_1)); + return (0.0 < vec_extract(vec_return, 0)); +}; + +#undef gmx_always_inline + +#endif /* GMX_CPU_ACCELERATION_IBM_QPX */ #ifdef GMX_HAVE_SIMD_MACROS /* Generic functions to extract a SIMD aligned pointer from a pointer x. @@ -519,6 +812,7 @@ gmx_simd_align_int(const int *x) #include "gmx_simd_math_single.h" #endif + #endif /* GMX_HAVE_SIMD_MACROS */ #endif /* _gmx_simd_macros_h_ */ diff --git a/include/gmx_simd_ref.h b/include/gmx_simd_ref.h index 082132ed5c..a1b78400b5 100644 --- a/include/gmx_simd_ref.h +++ b/include/gmx_simd_ref.h @@ -347,8 +347,9 @@ gmx_simd_ref_cmplt_pr(gmx_simd_ref_pr a, gmx_simd_ref_pr b) return c; } -/* Logical AND on SIMD booleans */ -static gmx_inline gmx_simd_ref_pb +/* Logical AND on SIMD booleans. Can't be static or it can't be a + template parameter (at least on XLC for BlueGene/Q) */ +gmx_inline gmx_simd_ref_pb gmx_simd_ref_and_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b) { gmx_simd_ref_pb c; @@ -362,8 +363,9 @@ gmx_simd_ref_and_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b) return c; } -/* Logical OR on SIMD booleans */ -static gmx_inline gmx_simd_ref_pb +/* Logical OR on SIMD booleans. Can't be static or it can't be a + template parameter (at least on XLC for BlueGene/Q) */ +gmx_inline gmx_simd_ref_pb gmx_simd_ref_or_pb(gmx_simd_ref_pb a, gmx_simd_ref_pb b) { gmx_simd_ref_pb c; diff --git a/include/network.h b/include/network.h index 66bbd03eb5..5b00c546ea 100644 --- a/include/network.h +++ b/include/network.h @@ -67,8 +67,11 @@ int gmx_node_rank(void); GMX_LIBGMX_EXPORT int gmx_hostname_num(void); -/* If the first part of the hostname (up to the first dot) ends with a number, returns this number. - If the first part of the hostname does not ends in a number (0-9 characters), returns 0. +/* Ostensibly, returns an integer characteristic of and unique to each + physical node in the MPI system. If the first part of the MPI + hostname (up to the first dot) ends with a number, returns this + number. If the first part of the MPI hostname does not end in a + number (0-9 characters), returns 0. */ GMX_LIBGMX_EXPORT diff --git a/include/types/nb_verlet.h b/include/types/nb_verlet.h index 19a0e14a0d..f2a8e4b051 100644 --- a/include/types/nb_verlet.h +++ b/include/types/nb_verlet.h @@ -58,7 +58,7 @@ extern "C" { /* #define GMX_NBNXN_SIMD_2XNN */ -#ifdef GMX_X86_SSE2 +#if (defined GMX_X86_SSE2) || (defined GMX_CPU_ACCELERATION_IBM_QPX) /* Use SIMD accelerated nbnxn search and kernels */ #define GMX_NBNXN_SIMD diff --git a/include/types/nbnxn_pairlist.h b/include/types/nbnxn_pairlist.h index 16512941c8..364290bee8 100644 --- a/include/types/nbnxn_pairlist.h +++ b/include/types/nbnxn_pairlist.h @@ -39,6 +39,10 @@ #ifndef _nbnxn_pairlist_h #define _nbnxn_pairlist_h +#ifdef HAVE_CONFIG_H +# include +#endif + #ifdef __cplusplus extern "C" { #endif @@ -74,8 +78,12 @@ typedef void nbnxn_free_t (void *ptr); * is found, all subsequent j-entries in the i-entry also have full masks.
*/ typedef struct { - int cj; /* The j-cluster */ - unsigned excl; /* The topology exclusion (interaction) bits */ + int cj; /* The j-cluster */ + unsigned excl; /* The exclusion (interaction) bits */ +#ifdef GMX_CPU_ACCELERATION_IBM_QPX + /* Indices into the arrays of SIMD interaction masks. */ + char interaction_mask_indices[4]; +#endif } nbnxn_cj_t; /* In nbnxn_ci_t the integer shift contains the shift in the lower 7 bits. @@ -257,12 +265,14 @@ typedef struct { */ unsigned *simd_exclusion_filter1; unsigned *simd_exclusion_filter2; - - int nout; /* The number of force arrays */ - nbnxn_atomdata_output_t *out; /* Output data structures */ - int nalloc; /* Allocation size of all arrays (for x/f *x/fstride) */ - gmx_bool bUseBufferFlags; /* Use the flags or operate on all atoms */ - nbnxn_buffer_flags_t buffer_flags; /* Flags for buffer zeroing+reduc. */ +#ifdef GMX_CPU_ACCELERATION_IBM_QPX + real *simd_interaction_array; /* Array of masks needed for exclusions on QPX */ +#endif + int nout; /* The number of force arrays */ + nbnxn_atomdata_output_t *out; /* Output data structures */ + int nalloc; /* Allocation size of all arrays (for x/f *x/fstride) */ + gmx_bool bUseBufferFlags; /* Use the flags or operate on all atoms */ + nbnxn_buffer_flags_t buffer_flags; /* Flags for buffer zeroing+reduc. */ } nbnxn_atomdata_t; #ifdef __cplusplus diff --git a/src/config.h.cmakein b/src/config.h.cmakein index f023d21787..4c49a0e311 100644 --- a/src/config.h.cmakein +++ b/src/config.h.cmakein @@ -93,6 +93,12 @@ /* Use AMD core math library */ #cmakedefine GMX_FFT_ACML +/* Target platform is x86 or x86_64 */ +#cmakedefine GMX_IS_X86 + +/* Target platform is BlueGene/Q */ +#cmakedefine GMX_IS_BGQ + /* SSE2 instructions available */ #cmakedefine GMX_X86_SSE2 diff --git a/src/gmxlib/gmx_cpuid.c b/src/gmxlib/gmx_cpuid.c index 9baa2080d4..be80f54eb2 100644 --- a/src/gmxlib/gmx_cpuid.c +++ b/src/gmxlib/gmx_cpuid.c @@ -63,7 +63,7 @@ /* For convenience, and to enable configure-time invocation, we keep all architectures * in a single file, but to avoid repeated ifdefs we set the overall architecture here. */ -#if defined (__i386__) || defined (__x86_64__) || defined (_M_IX86) || defined (_M_X64) +#ifdef GMX_IS_X86 /* OK, it is x86, but can we execute cpuid? */ #if defined(GMX_X86_GCC_INLINE_ASM) || ( defined(_MSC_VER) && ( (_MSC_VER > 1500) || (_MSC_VER==1500 & _MSC_FULL_VER >= 150030729))) # define GMX_CPUID_X86 diff --git a/src/gmxlib/network.c b/src/gmxlib/network.c index 607140e299..4174f40ffb 100644 --- a/src/gmxlib/network.c +++ b/src/gmxlib/network.c @@ -260,6 +260,9 @@ int gmx_node_rank(void) #endif } +#if defined GMX_LIB_MPI && defined GMX_IS_BGQ +#include +#endif int gmx_hostname_num() { @@ -277,6 +280,28 @@ int gmx_hostname_num() char mpi_hostname[MPI_MAX_PROCESSOR_NAME], hostnum_str[MPI_MAX_PROCESSOR_NAME]; MPI_Get_processor_name(mpi_hostname, &resultlen); +#ifdef GMX_IS_BGQ + Personality_t personality; + Kernel_GetPersonality(&personality, sizeof(personality)); + /* Each MPI rank has a unique coordinate in a 6-dimensional space + (A,B,C,D,E,T), with dimensions A-E corresponding to different + physical nodes, and T within each node. Each node has sixteen + physical cores, each of which can have up to four hardware + threads, so 0 <= T <= 63 (but the maximum value of T depends on + the configuration of ranks and OpenMP threads per + node). However, T is irrelevant for computing a suitable return + value for gmx_hostname_num().
+ */ + hostnum = personality.Network_Config.Acoord; + hostnum *= personality.Network_Config.Bnodes; + hostnum += personality.Network_Config.Bcoord; + hostnum *= personality.Network_Config.Cnodes; + hostnum += personality.Network_Config.Ccoord; + hostnum *= personality.Network_Config.Dnodes; + hostnum += personality.Network_Config.Dcoord; + hostnum *= personality.Network_Config.Enodes; + hostnum += personality.Network_Config.Ecoord; +#else /* This procedure can only differentiate nodes with host names * that end on unique numbers. */ @@ -301,11 +326,32 @@ int gmx_hostname_num() /* Use only the last 9 decimals, so we don't overflow an int */ hostnum = strtol(hostnum_str + max(0, j-9), NULL, 10); } +#endif if (debug) { - fprintf(debug, "In gmx_setup_nodecomm: hostname '%s', hostnum %d\n", + fprintf(debug, "In gmx_hostname_num: hostname '%s', hostnum %d\n", mpi_hostname, hostnum); +#ifdef GMX_IS_BGQ + fprintf(debug, + "Torus ID A: %d / %d B: %d / %d C: %d / %d D: %d / %d E: %d / %d\nNode ID T: %d / %d core: %d / %d hardware thread: %d / %d\n", + personality.Network_Config.Acoord, + personality.Network_Config.Anodes, + personality.Network_Config.Bcoord, + personality.Network_Config.Bnodes, + personality.Network_Config.Ccoord, + personality.Network_Config.Cnodes, + personality.Network_Config.Dcoord, + personality.Network_Config.Dnodes, + personality.Network_Config.Ecoord, + personality.Network_Config.Enodes, + Kernel_ProcessorCoreID(), + 16, + Kernel_ProcessorID(), + 64, + Kernel_ProcessorThreadID(), + 4); +#endif } return hostnum; #endif diff --git a/src/kernel/runner.c b/src/kernel/runner.c index a3ec7bf62d..147337e6ca 100644 --- a/src/kernel/runner.c +++ b/src/kernel/runner.c @@ -1011,6 +1011,15 @@ int mdrunner(gmx_hw_opt_t *hw_opt, gmx_fatal(FARGS, "GPU requested, but can't be used without cutoff-scheme=Verlet"); } } +#ifdef GMX_IS_BGQ + else + { + md_print_warn(cr, fplog, + "NOTE: There is no SIMD implementation of the group scheme kernels on\n" + " BlueGene/Q. You will observe better performance from using the\n" + " Verlet cut-off scheme.\n"); + } +#endif } #ifndef GMX_THREAD_MPI if (PAR(cr)) diff --git a/src/mdlib/forcerec.c b/src/mdlib/forcerec.c index 0e3f0a10e1..2b79f8e447 100644 --- a/src/mdlib/forcerec.c +++ b/src/mdlib/forcerec.c @@ -1523,11 +1523,12 @@ static void pick_nbnxn_kernel_cpu(FILE *fp, #endif } - /* Analytical Ewald exclusion correction is only an option in the - * x86 SIMD kernel. This is faster in single precision - * on Bulldozer and slightly faster on Sandy Bridge. + /* Analytical Ewald exclusion correction is only an option in + * the SIMD kernel. On BlueGene/Q, this is faster regardless + * of precision. In single precision, this is faster on + * Bulldozer, and slightly faster on Sandy Bridge. 
*/ -#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE +#if ((defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE) || (defined GMX_CPU_ACCELERATION_IBM_QPX) *ewald_excl = ewaldexclAnalytical; #endif if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL) @@ -1540,7 +1541,7 @@ static void pick_nbnxn_kernel_cpu(FILE *fp, } } -#endif /* GMX_X86_SSE2 */ +#endif /* GMX_NBNXN_SIMD */ } @@ -1584,11 +1585,11 @@ const char *lookup_nbnxn_kernel_name(int kernel_type) #endif #endif #endif -#else /* GMX_X86_SSE2 */ +#else /* GMX_X86_SSE2 */ /* not GMX_X86_SSE2, but other SIMD */ returnvalue = "SIMD"; #endif /* GMX_X86_SSE2 */ -#else /* GMX_NBNXN_SIMD */ +#else /* GMX_NBNXN_SIMD */ returnvalue = "not available"; #endif /* GMX_NBNXN_SIMD */ break; diff --git a/src/mdlib/nbnxn_atomdata.c b/src/mdlib/nbnxn_atomdata.c index a82a3b3275..5457ab5a36 100644 --- a/src/mdlib/nbnxn_atomdata.c +++ b/src/mdlib/nbnxn_atomdata.c @@ -48,7 +48,6 @@ #include "nbnxn_consts.h" #include "nbnxn_internal.h" #include "nbnxn_search.h" -#include "nbnxn_atomdata.h" #include "gmx_omp_nthreads.h" /* Default nbnxn allocation routine, allocates NBNXN_MEM_ALIGN byte aligned */ @@ -429,6 +428,91 @@ static void set_combination_rule_data(nbnxn_atomdata_t *nbat) } } +#ifdef GMX_NBNXN_SIMD +static void +nbnxn_atomdata_init_simple_exclusion_masks(nbnxn_atomdata_t *nbat) +{ + int i, j; + const int simd_width = GMX_SIMD_WIDTH_HERE; + int simd_excl_size; + /* Set the diagonal cluster pair exclusion mask setup data. + * In the kernel we check 0 < j - i to generate the masks. + * Here we store j - i for generating the mask for the first i, + * we subtract 0.5 to avoid rounding issues. + * In the kernel we can subtract 1 to generate the subsequent mask. + */ + int simd_4xn_diag_size; + const real simdFalse = -1, simdTrue = 1; + real *simd_interaction_array; + + simd_4xn_diag_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width); + snew_aligned(nbat->simd_4xn_diagonal_j_minus_i, simd_4xn_diag_size, NBNXN_MEM_ALIGN); + for (j = 0; j < simd_4xn_diag_size; j++) + { + nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5; + } + + snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i, simd_width, NBNXN_MEM_ALIGN); + for (j = 0; j < simd_width/2; j++) + { + /* The j-cluster size is half the SIMD width */ + nbat->simd_2xnn_diagonal_j_minus_i[j] = j - 0.5; + /* The next half of the SIMD width is for i + 1 */ + nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5; + } + + /* We use up to 32 bits for exclusion masking. + * The same masks are used for the 4xN and 2x(N+N) kernels. + * The masks are read either into epi32 SIMD registers or into + * real SIMD registers (together with a cast). + * In single precision this means the real and epi32 SIMD registers + * are of equal size. + * In double precision the epi32 registers can be smaller than + * the real registers, so depending on the architecture, we might + * need to use two, identical, 32-bit masks per real.
+ */ + simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width; + snew_aligned(nbat->simd_exclusion_filter1, simd_excl_size, NBNXN_MEM_ALIGN); + snew_aligned(nbat->simd_exclusion_filter2, simd_excl_size*2, NBNXN_MEM_ALIGN); + + for (j = 0; j < simd_excl_size; j++) + { + /* Set the consecutive bits for masking pair exclusions */ + nbat->simd_exclusion_filter1[j] = (1U << j); + nbat->simd_exclusion_filter2[j*2 + 0] = (1U << j); + nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j); + } + +#if (defined GMX_CPU_ACCELERATION_IBM_QPX) + /* The QPX kernels shouldn't do the bit masking that is done on + * x86, because the SIMD units lack bit-wise operations. Instead, + * we generate a vector of all 2^4 possible ways an i atom + * interacts with its 4 j atoms. Each array entry contains + * simd_width signed ints that are read in a single SIMD + * load. These ints must contain values that will be interpreted + * as true and false when loaded in the SIMD floating-point + * registers, ie. any positive or any negative value, + * respectively. Each array entry encodes how this i atom will + * interact with the 4 j atoms. Matching code exists in + * set_ci_top_excls() to generate indices into this array. Those + * indices are used in the kernels. */ + + simd_excl_size = NBNXN_CPU_CLUSTER_I_SIZE*NBNXN_CPU_CLUSTER_I_SIZE; + const int qpx_simd_width = GMX_SIMD_WIDTH_HERE; + snew_aligned(simd_interaction_array, simd_excl_size * qpx_simd_width, NBNXN_MEM_ALIGN); + for (j = 0; j < simd_excl_size; j++) + { + int index = j * qpx_simd_width; + for (i = 0; i < qpx_simd_width; i++) + { + simd_interaction_array[index + i] = (j & (1 << i)) ? simdTrue : simdFalse; + } + } + nbat->simd_interaction_array = simd_interaction_array; +#endif +} +#endif + /* Initializes an nbnxn_atomdata_t data structure */ void nbnxn_atomdata_init(FILE *fp, nbnxn_atomdata_t *nbat, @@ -663,54 +747,7 @@ void nbnxn_atomdata_init(FILE *fp, #ifdef GMX_NBNXN_SIMD if (simple) { - /* Set the diagonal cluster pair interaction mask setup data. - * In the kernel we check 0 < j - i to generate the masks. - * Here we store j - i for generating the mask for the first i (i=0); - * we substract 0.5 to avoid rounding issues. - * In the kernel we can subtract 1 to generate the mask for the next i. - */ - const int simd_width = GMX_SIMD_WIDTH_HERE; - int simd_4xn_diag_ind_size, simd_interaction_size, j; - - simd_4xn_diag_ind_size = max(NBNXN_CPU_CLUSTER_I_SIZE, simd_width); - snew_aligned(nbat->simd_4xn_diagonal_j_minus_i, - simd_4xn_diag_ind_size, NBNXN_MEM_ALIGN); - for (j = 0; j < simd_4xn_diag_ind_size; j++) - { - nbat->simd_4xn_diagonal_j_minus_i[j] = j - 0.5; - } - - snew_aligned(nbat->simd_2xnn_diagonal_j_minus_i, - simd_width, NBNXN_MEM_ALIGN); - for (j = 0; j < simd_width/2; j++) - { - /* The j-cluster size is half the SIMD width */ - nbat->simd_2xnn_diagonal_j_minus_i[j] = j - 0.5; - /* The next half of the SIMD width is for i + 1 */ - nbat->simd_2xnn_diagonal_j_minus_i[simd_width/2+j] = j - 1 - 0.5; - } - - /* We use up to 32 bits for exclusion masking. - * The same masks are used for the 4xN and 2x(N+N) kernels. - * The masks are read either into epi32 SIMD registers or into - * real SIMD registers (together with a cast). - * In single precision this means the real and epi32 SIMD registers - * are of equal size. - * In double precision the epi32 registers can be smaller than - * the real registers, so depending on the architecture, we might - * need to use two, identical, 32-bit masks per real. 
- */ - simd_interaction_size = NBNXN_CPU_CLUSTER_I_SIZE*simd_width; - snew_aligned(nbat->simd_exclusion_filter1, simd_interaction_size, NBNXN_MEM_ALIGN); - snew_aligned(nbat->simd_exclusion_filter2, simd_interaction_size*2, NBNXN_MEM_ALIGN); - - for (j = 0; j < simd_interaction_size; j++) - { - /* Set the consecutive bits for filters pair exclusions masks */ - nbat->simd_exclusion_filter1[j] = (1U << j); - nbat->simd_exclusion_filter2[j*2 + 0] = (1U << j); - nbat->simd_exclusion_filter2[j*2 + 1] = (1U << j); - } + nbnxn_atomdata_init_simple_exclusion_masks(nbat); } #endif diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index f5e3e02f74..65f8fca137 100644 --- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -601,7 +601,7 @@ void nbnxn_cuda_init(FILE *fplog, bTMPIAtomics = false; #endif -#if defined(i386) || defined(__x86_64__) +#ifdef GMX_IS_X86 bX86 = true; #else bX86 = false; diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h index 0962146625..9fde8f06e7 100644 --- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils.h @@ -146,6 +146,15 @@ static const int nbfp_stride = 4; static const int nbfp_stride = GMX_SIMD_WIDTH_HERE; #endif +/* We use the FDV0 table layout when we can use aligned table loads */ +#if GMX_SIMD_WIDTH_HERE == 4 +#define TAB_FDV0 +#endif + +#ifdef GMX_CPU_ACCELERATION_IBM_QPX +#include "nbnxn_kernel_simd_utils_ibm_qpx.h" +#endif /* GMX_CPU_ACCELERATION_IBM_QPX */ + #endif /* GMX_X86_SSE2 */ #endif /* GMX_SIMD_REFERENCE_PLAIN_C */ diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h new file mode 100644 index 0000000000..96faaf89c4 --- /dev/null +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ibm_qpx.h @@ -0,0 +1,250 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 1991-2000, University of Groningen, The Netherlands. + * Copyright (c) 2001-2012, The GROMACS Development Team + * Copyright (c) 2012,2013, by the GROMACS development team, led by + * David van der Spoel, Berk Hess, Erik Lindahl, and including many + * others, as listed in the AUTHORS file in the top-level source + * directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. 
Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +#ifndef _nbnxn_kernel_simd_utils_ibm_qpx_h_ +#define _nbnxn_kernel_simd_utils_ibm_qpx_h_ + +typedef gmx_mm_pr gmx_exclfilter; +static const int filter_stride = 1; + +/* The 4xn kernel operates on 4-wide i-force registers */ +typedef gmx_mm_pr gmx_mm_pr4; + +/* This files contains all functions/macros for the SIMD kernels + * which have explicit dependencies on the j-cluster size and/or SIMD-width. + * The functionality which depends on the j-cluster size is: + * LJ-parameter lookup + * force table lookup + * energy group pair energy storage + */ + +/* Collect all [0123] elements of the 4 inputs to out[0123], respectively */ +static gmx_inline void +gmx_transpose_4_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d, + gmx_mm_pr *out0, gmx_mm_pr *out1, + gmx_mm_pr *out2, gmx_mm_pr *out3) +{ + /* Prepare control vectors for swizzling. In its third input, + vec_perm accepts indices into the effective 8-wide SIMD vector + created by concatenating its first two inputs. Those indices + map data from the input vectors to the output vector. + + vec_gpci() converts an octal literal of the indices into the + correct form for vec_perm() to use. That form is an octal digit + in bits 0-2 of the mantissa of each double. */ + gmx_mm_pr p6420 = vec_gpci(06420); + gmx_mm_pr p7531 = vec_gpci(07531); + + /* Four-way swizzle (i.e. transpose) of vectors a = a0a1a2a3, etc. */ + gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420); + gmx_mm_pr b3b1a3a1 = vec_perm(a, b, p7531); + gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420); + gmx_mm_pr d3d1c3c1 = vec_perm(c, d, p7531); + *out0 = vec_perm(d2d0c2c0, b2b0a2a0, p7531); + *out1 = vec_perm(d3d1c3c1, b3b1a3a1, p7531); + *out2 = vec_perm(d2d0c2c0, b2b0a2a0, p6420); + *out3 = vec_perm(d3d1c3c1, b3b1a3a1, p6420); +} + +/* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */ +static gmx_inline void +gmx_shuffle_4_ps_fil01_to_2_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d, + gmx_mm_pr *out0, gmx_mm_pr *out1) +{ + gmx_mm_pr p6420 = vec_gpci(06420); + gmx_mm_pr p7531 = vec_gpci(07531); + + /* Partial four-way swizzle of vectors a = a0a1a2a3, etc. */ + gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420); + gmx_mm_pr b3b1a3a1 = vec_perm(a, b, p7531); + gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420); + gmx_mm_pr d3d1c3c1 = vec_perm(c, d, p7531); + *out0 = vec_perm(d2d0c2c0, b2b0a2a0, p7531); + *out1 = vec_perm(d3d1c3c1, b3b1a3a1, p7531); +} + +/* Collect element 2 of the 4 inputs to out */ +static gmx_inline gmx_mm_pr +gmx_shuffle_4_ps_fil2_to_1_ps(gmx_mm_pr a, gmx_mm_pr b, gmx_mm_pr c, gmx_mm_pr d) +{ + gmx_mm_pr p6420 = vec_gpci(06420); + + /* Partial four-way swizzle of vectors a = a0a1a2a3, etc. */ + gmx_mm_pr b2b0a2a0 = vec_perm(a, b, p6420); + gmx_mm_pr d2d0c2c0 = vec_perm(c, d, p6420); + return vec_perm(d2d0c2c0, b2b0a2a0, p6420); +} + +#ifdef TAB_FDV0 +/* Align a stack-based thread-local working array. Table loads on QPX + * use the array, but most other implementations do not. 
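The vec_perm/vec_gpci combination used by the transpose and shuffle helpers above can be written out in plain C (sketch only; it assumes, consistently with the variable names in gmx_transpose_4_ps, that octal digit k of the control word selects output element k, with indices 0-3 addressing the first input and 4-7 the second):

    #include <stdio.h>

    static void perm(const float *a, const float *b, const int idx[4], float *out)
    {
        float cat[8];
        int   i;

        for (i = 0; i < 4; i++)
        {
            cat[i]     = a[i];     /* indices 0-3: first input  */
            cat[4 + i] = b[i];     /* indices 4-7: second input */
        }
        for (i = 0; i < 4; i++)
        {
            out[i] = cat[idx[i]];
        }
    }

    int main(void)
    {
        float a[4]     = { 0, 1, 2, 3 };       /* a0 a1 a2 a3 */
        float b[4]     = { 10, 11, 12, 13 };   /* b0 b1 b2 b3 */
        int   p6420[4] = { 6, 4, 2, 0 };       /* octal 06420 */
        float out[4];
        int   i;

        perm(a, b, p6420, out);
        for (i = 0; i < 4; i++)
        {
            printf("%g ", out[i]);             /* prints 12 10 2 0, i.e. b2 b0 a2 a0 */
        }
        printf("\n");
        return 0;
    }

With this convention vec_perm(a, b, vec_gpci(06420)) yields b2 b0 a2 a0, and two such passes give the full 4x4 transpose.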
*/ +static gmx_inline int * +prepare_table_load_buffer(const int *array) +{ + return gmx_simd_align_int(array); +} + +static gmx_inline void +load_table_f(const real *tab_coul_FDV0, gmx_epi32 ti_S, int *ti, + gmx_mm_pr *ctab0_S, gmx_mm_pr *ctab1_S) +{ +#ifdef NDEBUG + /* Just like 256-bit AVX, we need to use memory to get indices + into integer registers efficiently. */ + vec_st(ti_S, 0, ti); +#else + vec_sta(ti_S, 0, ti); +#endif + + /* Here we load 4 aligned reals, but we need just 2 elements of each */ + gmx_mm_pr a = gmx_load_pr(tab_coul_FDV0 + ti[0] * nbfp_stride); + gmx_mm_pr b = gmx_load_pr(tab_coul_FDV0 + ti[1] * nbfp_stride); + gmx_mm_pr c = gmx_load_pr(tab_coul_FDV0 + ti[2] * nbfp_stride); + gmx_mm_pr d = gmx_load_pr(tab_coul_FDV0 + ti[3] * nbfp_stride); + + gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, ctab0_S, ctab1_S); +} + +static gmx_inline void +load_table_f_v(const real *tab_coul_FDV0, + gmx_epi32 ti_S, int *ti, + gmx_mm_pr *ctab0_S, gmx_mm_pr *ctab1_S, + gmx_mm_pr *ctabv_S) +{ +#ifdef NDEBUG + /* Just like 256-bit AVX, we need to use memory to get indices + into integer registers efficiently. */ + vec_st(ti_S, 0, ti); +#else + vec_sta(ti_S, 0, ti); +#endif + + /* Here we load 4 aligned reals, but we need just 3 elements of each. */ + gmx_mm_pr a = gmx_load_pr(tab_coul_FDV0 + ti[0] * nbfp_stride); + gmx_mm_pr b = gmx_load_pr(tab_coul_FDV0 + ti[1] * nbfp_stride); + gmx_mm_pr c = gmx_load_pr(tab_coul_FDV0 + ti[2] * nbfp_stride); + gmx_mm_pr d = gmx_load_pr(tab_coul_FDV0 + ti[3] * nbfp_stride); + + gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, ctab0_S, ctab1_S); + *ctabv_S = gmx_shuffle_4_ps_fil2_to_1_ps(a, b, c, d); +} +#else + +/* Not required for BlueGene/Q */ + +#endif + +/* Sum the elements within each input register and store the sums in out. + */ +static gmx_inline gmx_mm_pr +gmx_mm_transpose_sum4_pr(gmx_mm_pr a, gmx_mm_pr b, + gmx_mm_pr c, gmx_mm_pr d) +{ + gmx_mm_pr a0b0c0d0, a1b1c1d1, a2b2c2d2, a3b3c3d3; + gmx_transpose_4_ps(a, b, c, d, + &a0b0c0d0, + &a1b1c1d1, + &a2b2c2d2, + &a3b3c3d3); + /* Now reduce the transposed vectors */ + gmx_mm_pr sum01 = gmx_add_pr(a0b0c0d0, a1b1c1d1); + gmx_mm_pr sim23 = gmx_add_pr(a2b2c2d2, a3b3c3d3); + return gmx_add_pr(sum01, sim23); +} + +#ifdef GMX_DOUBLE +/* In double precision on x86 it can be faster to first calculate + * single precision square roots for two double precision registers at + * once and then use double precision Newton-Raphson iteration to + * reach full double precision. For QPX, we just wrap the usual + * reciprocal square roots. + */ +static gmx_inline void +gmx_mm_invsqrt2_pd(gmx_mm_pr in0, gmx_mm_pr in1, + gmx_mm_pr *out0, gmx_mm_pr *out1) +{ + *out0 = gmx_invsqrt_pr(in0); + *out1 = gmx_invsqrt_pr(in1); +} +#endif + +static gmx_inline void +load_lj_pair_params(const real *nbfp, const int *type, int aj, + gmx_mm_pr *c6_S, gmx_mm_pr *c12_S) +{ + /* Here we load 4 aligned reals, but we need just 2 elemnts of each. 
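load_table_f_v above emulates a gather: the indices are first stored to aligned memory, four aligned FDV0 table entries are loaded, and the shuffles collect the F, D and V components into separate registers. A plain-C sketch of the net effect (illustration only; FDV0 is taken here to mean force, force difference, energy and one padding element per table point):

    #include <stdio.h>

    #define STRIDE 4   /* FDV0: force, force difference, energy, padding */

    int main(void)
    {
        float table[8*STRIDE];        /* a tiny dummy FDV0 table */
        int   ti[4] = { 1, 3, 4, 6 }; /* table indices, as stored from the SIMD register */
        float ctab0[4], ctab1[4], ctabv[4];
        int   i;

        for (i = 0; i < 8*STRIDE; i++)
        {
            table[i] = (float)i;
        }
        for (i = 0; i < 4; i++)
        {
            const float *entry = table + ti[i]*STRIDE;   /* one aligned 4-wide load */

            ctab0[i] = entry[0];      /* F                */
            ctab1[i] = entry[1];      /* force difference */
            ctabv[i] = entry[2];      /* V                */
            /* entry[3] is padding that keeps the loads aligned */
        }
        for (i = 0; i < 4; i++)
        {
            printf("i=%d  F=%g  D=%g  V=%g\n", i, ctab0[i], ctab1[i], ctabv[i]);
        }
        return 0;
    }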
*/ + gmx_mm_pr a = gmx_load_pr(nbfp + type[aj+0] * nbfp_stride); + gmx_mm_pr b = gmx_load_pr(nbfp + type[aj+1] * nbfp_stride); + gmx_mm_pr c = gmx_load_pr(nbfp + type[aj+2] * nbfp_stride); + gmx_mm_pr d = gmx_load_pr(nbfp + type[aj+3] * nbfp_stride); + + gmx_shuffle_4_ps_fil01_to_2_ps(a, b, c, d, c6_S, c12_S); +} + +/* Define USE_FUNCTIONS_FOR_QPX to get the static inline functions + * that seem to exhaust xlC 12.1 during kernel compilation */ + +#ifndef USE_FUNCTIONS_FOR_QPX + +#define gmx_load_exclusion_filter(a) vec_ldia(0, (int *) a) +#define gmx_load_interaction_mask_pb(a, b) vec_ld(a, (real *) b) + +#else /* USE_FUNCTIONS_FOR_QPX */ + +static gmx_inline gmx_exclfilter gmx_load_exclusion_filter(const unsigned *a) +{ +#ifdef NDEBUG + return vec_ldia(0, (int *) a); +#else + return vec_ldiaa(0, (int *) a); +#endif +} + +/* Code for handling loading and applying exclusion masks. Note that + parameter a is not treated like an array index; it is naively added + to b, so should be in bytes. */ +static gmx_inline gmx_mm_pb gmx_load_interaction_mask_pb(long a, const real *b) +{ +#ifdef NDEBUG + return vec_ld(a, (real *) b); +#else + return vec_lda(a, (real *) b); +#endif +} + +#endif /* USE_FUNCTIONS_FOR_QPX */ + +#endif /* _nbnxn_kernel_simd_utils_ibm_qpx_h_ */ diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h index ac7ddec9b9..1d0d10e9ee 100644 --- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_ref.h @@ -37,10 +37,20 @@ #ifndef _nbnxn_kernel_simd_utils_ref_h_ #define _nbnxn_kernel_simd_utils_ref_h_ -typedef gmx_simd_ref_epi32 gmx_simd_ref_exclfilter; -#define gmx_exclfilter gmx_simd_ref_exclfilter +typedef gmx_simd_ref_epi32 gmx_simd_ref_exclfilter; +typedef gmx_simd_ref_exclfilter gmx_exclfilter; static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE; +/* Set the stride for the lookup of the two LJ parameters from their + (padded) array. Only strides of 2 and 4 are currently supported. 
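The stride chosen here governs the padded LJ-parameter lookup that load_lj_pair_params performs: c6 and c12 for a type sit at offsets 0 and 1 of a stride-sized slot, with the remaining entries as padding so that the SIMD loads stay aligned. A scalar sketch (illustration only; in the real kernels the nbfp pointer passed in is already offset for the i-atom type, which is glossed over here):

    #include <stdio.h>

    #define NBFP_STRIDE 4   /* stride suitable for 4-wide aligned loads */
    #define NTYPES      3

    int main(void)
    {
        float nbfp[NTYPES*NBFP_STRIDE];
        int   type[4] = { 2, 0, 1, 2 };   /* LJ types of four j atoms */
        float c6[4], c12[4];
        int   t, j;

        for (t = 0; t < NTYPES; t++)
        {
            nbfp[t*NBFP_STRIDE + 0] = 6.0f  + t;   /* c6 for this type  */
            nbfp[t*NBFP_STRIDE + 1] = 12.0f + t;   /* c12 for this type */
            nbfp[t*NBFP_STRIDE + 2] = 0.0f;        /* padding */
            nbfp[t*NBFP_STRIDE + 3] = 0.0f;        /* padding */
        }
        for (j = 0; j < 4; j++)
        {
            c6[j]  = nbfp[type[j]*NBFP_STRIDE + 0];
            c12[j] = nbfp[type[j]*NBFP_STRIDE + 1];
        }
        for (j = 0; j < 4; j++)
        {
            printf("j=%d  c6=%g  c12=%g\n", j, c6[j], c12[j]);
        }
        return 0;
    }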
*/ +#if defined GMX_NBNXN_SIMD_2XNN +static const int nbfp_stride = 4; +#elif defined GMX_DOUBLE +static const int nbfp_stride = 2; +#else +static const int nbfp_stride = 4; +#endif + #if GMX_SIMD_WIDTH_HERE > 4 /* The 4xn kernel operates on 4-wide i-force registers */ @@ -89,6 +99,10 @@ gmx_add_pr4(gmx_mm_pr4 a, gmx_mm_pr4 b) return c; } +#else + +typedef gmx_simd_ref_pr gmx_simd_ref_pr4; + #endif diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h index a2a65f91b2..ac6ce836a3 100644 --- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128d.h @@ -45,7 +45,7 @@ * energy group pair energy storage */ -#define gmx_exclfilter gmx_epi32 +typedef gmx_epi32 gmx_exclfilter; static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE; /* Transpose 2 double precision registers */ diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h index 6eea30fd1f..a0a37adbff 100644 --- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_128s.h @@ -45,7 +45,7 @@ * energy group pair energy storage */ -#define gmx_exclfilter gmx_epi32 +typedef gmx_epi32 gmx_exclfilter; static const int filter_stride = GMX_SIMD_EPI32_WIDTH/GMX_SIMD_WIDTH_HERE; /* Collect element 0 and 1 of the 4 inputs to out0 and out1, respectively */ @@ -81,7 +81,7 @@ gmx_mm_transpose_sum4_pr(__m128 in0, __m128 in1, _MM_TRANSPOSE4_PS(in0, in1, in2, in3); in0 = _mm_add_ps(in0, in1); in2 = _mm_add_ps(in2, in3); - + return _mm_add_ps(in0, in2); } @@ -91,7 +91,7 @@ load_lj_pair_params(const real *nbfp, const int *type, int aj, { __m128 clj_S[UNROLLJ]; int p; - + for (p = 0; p < UNROLLJ; p++) { /* Here we load 4 aligned floats, but we need just 2 */ diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h index 93ff74f6e5..e4aab86329 100644 --- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256d.h @@ -45,7 +45,7 @@ * energy group pair energy storage */ -#define gmx_exclfilter gmx_mm_pr +typedef gmx_mm_pr gmx_exclfilter; static const int filter_stride = 2; /* Transpose 2 double precision registers */ @@ -65,7 +65,7 @@ gmx_mm_transpose_sum4_pr(__m256d in0, __m256d in1, in0 = _mm256_hadd_pd(in0, in1); in2 = _mm256_hadd_pd(in2, in3); - return _mm256_add_pd(_mm256_permute2f128_pd(in0, in2, 0x20), _mm256_permute2f128_pd(in0, in2, 0x31)); + return _mm256_add_pd(_mm256_permute2f128_pd(in0, in2, 0x20), _mm256_permute2f128_pd(in0, in2, 0x31)); } static gmx_inline __m256 @@ -106,7 +106,7 @@ gmx_mm_invsqrt2_pd(__m256d in0, __m256d in1, __m256 s, ir; __m256d lu0, lu1; - s = gmx_2_m128_to_m256(_mm256_cvtpd_ps(in0), _mm256_cvtpd_ps(in1)); + s = gmx_2_m128_to_m256(_mm256_cvtpd_ps(in0), _mm256_cvtpd_ps(in1)); ir = gmx_mm256_invsqrt_ps_single(s); lu0 = _mm256_cvtps_pd(_mm256_castps256_ps128(ir)); lu1 = _mm256_cvtps_pd(_mm256_extractf128_ps(ir, 1)); diff --git a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h index 9a31c5c7cf..908f9ba795 100644 --- a/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h +++ b/src/mdlib/nbnxn_kernels/nbnxn_kernel_simd_utils_x86_256s.h @@ -45,7 +45,7 @@ * energy group pair energy storage */ -#define gmx_exclfilter 
gmx_mm_pr +typedef gmx_mm_pr gmx_exclfilter; static const int filter_stride = 1; /* The 4xn kernel operates on 4-wide i-force registers */ diff --git a/src/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h b/src/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h index e03e9015bd..fdf53b39b8 100644 --- a/src/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h +++ b/src/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_common.h @@ -39,6 +39,9 @@ #include "maths.h" #endif +#ifndef GMX_SIMD_J_UNROLL_SIZE +#error "Need to define GMX_SIMD_J_UNROLL_SIZE before including the 2xnn kernel common header file" +#endif #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3]) diff --git a/src/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h b/src/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h index a5afe2f250..93c0e8926e 100644 --- a/src/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h +++ b/src/mdlib/nbnxn_kernels/simd_2xnn/nbnxn_kernel_simd_2xnn_inner.h @@ -251,7 +251,9 @@ ajz = ajy + STRIDE; #ifdef CHECK_EXCLS - gmx_load_simd_2xnn_interactions(l_cj[cjind].excl, filter_S0, filter_S2, &interact_S0, &interact_S2); + gmx_load_simd_2xnn_interactions(l_cj[cjind].excl, + filter_S0, filter_S2, + &interact_S0, &interact_S2); #endif /* CHECK_EXCLS */ /* load j atom coordinates */ @@ -694,24 +696,28 @@ #endif /* CALC_ENERGIES */ #ifdef CALC_LJ - fscal_S0 = gmx_mul_pr(rinvsq_S0, #ifdef CALC_COULOMB + fscal_S0 = gmx_mul_pr(rinvsq_S0, gmx_add_pr(frcoul_S0, + gmx_sub_pr(FrLJ12_S0, FrLJ6_S0))); #else + fscal_S0 = gmx_mul_pr(rinvsq_S0, ( -#endif gmx_sub_pr(FrLJ12_S0, FrLJ6_S0))); +#endif #else fscal_S0 = gmx_mul_pr(rinvsq_S0, frcoul_S0); #endif /* CALC_LJ */ #if defined CALC_LJ && !defined HALF_LJ - fscal_S2 = gmx_mul_pr(rinvsq_S2, #ifdef CALC_COULOMB + fscal_S2 = gmx_mul_pr(rinvsq_S2, gmx_add_pr(frcoul_S2, + gmx_sub_pr(FrLJ12_S2, FrLJ6_S2))); #else + fscal_S2 = gmx_mul_pr(rinvsq_S2, ( -#endif gmx_sub_pr(FrLJ12_S2, FrLJ6_S2))); +#endif #else /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */ fscal_S2 = gmx_mul_pr(rinvsq_S2, frcoul_S2); diff --git a/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h b/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h index 4dd896ca82..b213175142 100644 --- a/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h +++ b/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_common.h @@ -39,6 +39,10 @@ #include "maths.h" #endif +#ifndef GMX_SIMD_J_UNROLL_SIZE +#error "Need to define GMX_SIMD_J_UNROLL_SIZE before including the 4xn kernel common header file" +#endif + #define SUM_SIMD4(x) (x[0]+x[1]+x[2]+x[3]) #define UNROLLI NBNXN_CPU_CLUSTER_I_SIZE @@ -73,17 +77,28 @@ gmx_load_simd_4xn_interactions(int excl, gmx_exclfilter filter_S1, gmx_exclfilter filter_S2, gmx_exclfilter filter_S3, + const char *interaction_mask_indices, + real *simd_interaction_array, gmx_mm_pb *interact_S0, gmx_mm_pb *interact_S1, gmx_mm_pb *interact_S2, gmx_mm_pb *interact_S3) { +#ifdef GMX_X86_SSE2 /* Load integer interaction mask */ gmx_exclfilter mask_pr_S = gmx_load1_exclfilter(excl); *interact_S0 = gmx_checkbitmask_pb(mask_pr_S, filter_S0); *interact_S1 = gmx_checkbitmask_pb(mask_pr_S, filter_S1); *interact_S2 = gmx_checkbitmask_pb(mask_pr_S, filter_S2); *interact_S3 = gmx_checkbitmask_pb(mask_pr_S, filter_S3); +#endif +#ifdef GMX_CPU_ACCELERATION_IBM_QPX + const int size = GMX_SIMD_WIDTH_HERE * sizeof(real); + *interact_S0 = gmx_load_interaction_mask_pb(size*interaction_mask_indices[0], 
simd_interaction_array); + *interact_S1 = gmx_load_interaction_mask_pb(size*interaction_mask_indices[1], simd_interaction_array); + *interact_S2 = gmx_load_interaction_mask_pb(size*interaction_mask_indices[2], simd_interaction_array); + *interact_S3 = gmx_load_interaction_mask_pb(size*interaction_mask_indices[3], simd_interaction_array); +#endif } /* All functionality defines are set here, except for: diff --git a/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h b/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h index 9465c463bf..26c171f138 100644 --- a/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h +++ b/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_inner.h @@ -283,7 +283,20 @@ ajz = ajy + STRIDE; #ifdef CHECK_EXCLS - gmx_load_simd_4xn_interactions(l_cj[cjind].excl, filter_S0, filter_S1, filter_S2, filter_S3, &interact_S0, &interact_S1, &interact_S2, &interact_S3); + gmx_load_simd_4xn_interactions(l_cj[cjind].excl, + filter_S0, filter_S1, + filter_S2, filter_S3, +#ifdef GMX_CPU_ACCELERATION_IBM_QPX + l_cj[cjind].interaction_mask_indices, + nbat->simd_interaction_array, +#else + /* The struct fields do not exist + except on BlueGene/Q */ + NULL, + NULL, +#endif + &interact_S0, &interact_S1, + &interact_S2, &interact_S3); #endif /* CHECK_EXCLS */ /* load j atom coordinates */ @@ -867,13 +880,15 @@ #endif #endif #ifndef ENERGY_GROUPS - Vvdwtot_S = gmx_add_pr(Vvdwtot_S, #ifndef HALF_LJ + Vvdwtot_S = gmx_add_pr(Vvdwtot_S, gmx_sum4_pr(VLJ_S0, VLJ_S1, VLJ_S2, VLJ_S3) + ); #else + Vvdwtot_S = gmx_add_pr(Vvdwtot_S, gmx_add_pr(VLJ_S0, VLJ_S1) -#endif ); +#endif #else add_ener_grp(VLJ_S0, vvdwtp[0], egp_jj); add_ener_grp(VLJ_S1, vvdwtp[1], egp_jj); @@ -886,39 +901,47 @@ #endif /* CALC_ENERGIES */ #ifdef CALC_LJ - fscal_S0 = gmx_mul_pr(rinvsq_S0, #ifdef CALC_COULOMB + fscal_S0 = gmx_mul_pr(rinvsq_S0, gmx_add_pr(frcoul_S0, + gmx_sub_pr(FrLJ12_S0, FrLJ6_S0))); #else + fscal_S0 = gmx_mul_pr(rinvsq_S0, ( -#endif gmx_sub_pr(FrLJ12_S0, FrLJ6_S0))); - fscal_S1 = gmx_mul_pr(rinvsq_S1, +#endif #ifdef CALC_COULOMB + fscal_S1 = gmx_mul_pr(rinvsq_S1, gmx_add_pr(frcoul_S1, + gmx_sub_pr(FrLJ12_S1, FrLJ6_S1))); #else + fscal_S1 = gmx_mul_pr(rinvsq_S1, ( -#endif gmx_sub_pr(FrLJ12_S1, FrLJ6_S1))); +#endif #else fscal_S0 = gmx_mul_pr(rinvsq_S0, frcoul_S0); fscal_S1 = gmx_mul_pr(rinvsq_S1, frcoul_S1); #endif /* CALC_LJ */ #if defined CALC_LJ && !defined HALF_LJ - fscal_S2 = gmx_mul_pr(rinvsq_S2, #ifdef CALC_COULOMB + fscal_S2 = gmx_mul_pr(rinvsq_S2, gmx_add_pr(frcoul_S2, + gmx_sub_pr(FrLJ12_S2, FrLJ6_S2))); #else + fscal_S2 = gmx_mul_pr(rinvsq_S2, ( -#endif gmx_sub_pr(FrLJ12_S2, FrLJ6_S2))); - fscal_S3 = gmx_mul_pr(rinvsq_S3, +#endif #ifdef CALC_COULOMB + fscal_S3 = gmx_mul_pr(rinvsq_S3, gmx_add_pr(frcoul_S3, + gmx_sub_pr(FrLJ12_S3, FrLJ6_S3))); #else + fscal_S3 = gmx_mul_pr(rinvsq_S3, ( -#endif gmx_sub_pr(FrLJ12_S3, FrLJ6_S3))); +#endif #else /* Atom 2 and 3 don't have LJ, so only add Coulomb forces */ fscal_S2 = gmx_mul_pr(rinvsq_S2, frcoul_S2); diff --git a/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h b/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h index 884577547b..2fe56cf5b3 100644 --- a/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h +++ b/src/mdlib/nbnxn_kernels/simd_4xn/nbnxn_kernel_simd_4xn_outer.h @@ -75,10 +75,10 @@ #if UNROLLJ >= 4 /* We use an i-force SIMD register width of 4 */ #if UNROLLJ == 4 -#define gmx_mm_pr4 gmx_mm_pr -#define gmx_load_pr4 gmx_load_pr -#define gmx_store_pr4 gmx_store_pr -#define 
gmx_add_pr4 gmx_add_pr +#define gmx_mm_pr4 gmx_mm_pr +#define gmx_load_pr4 gmx_load_pr +#define gmx_store_pr4 gmx_store_pr +#define gmx_add_pr4 gmx_add_pr #else /* The pr4 stuff is defined in nbnxn_kernel_simd_utils.h */ #endif @@ -100,16 +100,16 @@ unsigned *exclusion_filter; gmx_exclfilter filter_S0, filter_S1, filter_S2, filter_S3; - gmx_mm_pr zero_S = gmx_set1_pr(0); + gmx_mm_pr zero_S = gmx_set1_pr(0.0); - gmx_mm_pr one_S = gmx_set1_pr(1.0); - gmx_mm_pr iq_S0 = gmx_setzero_pr(); - gmx_mm_pr iq_S1 = gmx_setzero_pr(); - gmx_mm_pr iq_S2 = gmx_setzero_pr(); - gmx_mm_pr iq_S3 = gmx_setzero_pr(); - gmx_mm_pr mrc_3_S; + gmx_mm_pr one_S = gmx_set1_pr(1.0); + gmx_mm_pr iq_S0 = gmx_setzero_pr(); + gmx_mm_pr iq_S1 = gmx_setzero_pr(); + gmx_mm_pr iq_S2 = gmx_setzero_pr(); + gmx_mm_pr iq_S3 = gmx_setzero_pr(); + gmx_mm_pr mrc_3_S; #ifdef CALC_ENERGIES - gmx_mm_pr hrc_3_S, moh_rc_S; + gmx_mm_pr hrc_3_S, moh_rc_S; #endif #ifdef CALC_COUL_TAB diff --git a/src/mdlib/nbnxn_search.c b/src/mdlib/nbnxn_search.c index ac23d9bc59..367a17a80e 100644 --- a/src/mdlib/nbnxn_search.c +++ b/src/mdlib/nbnxn_search.c @@ -803,8 +803,8 @@ static void calc_bounding_box_x_x4_halves(int na, const real *x, * so we don't need to treat special cases in the rest of the code. */ #ifdef NBNXN_SEARCH_BB_SIMD4 - gmx_simd4_store_pr(&bbj[1].lower[0], gmx_simd4_load_pr(&bbj[0].lower[0])); - gmx_simd4_store_pr(&bbj[1].upper[0], gmx_simd4_load_pr(&bbj[0].upper[0])); + gmx_simd4_store_pr(&bbj[1].lower[0], gmx_simd4_load_bb_pr(&bbj[0].lower[0])); + gmx_simd4_store_pr(&bbj[1].upper[0], gmx_simd4_load_bb_pr(&bbj[0].upper[0])); #else bbj[1] = bbj[0]; #endif @@ -812,11 +812,11 @@ static void calc_bounding_box_x_x4_halves(int na, const real *x, #ifdef NBNXN_SEARCH_BB_SIMD4 gmx_simd4_store_pr(&bb->lower[0], - gmx_simd4_min_pr(gmx_simd4_load_pr(&bbj[0].lower[0]), - gmx_simd4_load_pr(&bbj[1].lower[0]))); + gmx_simd4_min_pr(gmx_simd4_load_bb_pr(&bbj[0].lower[0]), + gmx_simd4_load_bb_pr(&bbj[1].lower[0]))); gmx_simd4_store_pr(&bb->upper[0], - gmx_simd4_max_pr(gmx_simd4_load_pr(&bbj[0].upper[0]), - gmx_simd4_load_pr(&bbj[1].upper[0]))); + gmx_simd4_max_pr(gmx_simd4_load_bb_pr(&bbj[0].upper[0]), + gmx_simd4_load_bb_pr(&bbj[1].upper[0]))); #else { int i; @@ -877,12 +877,12 @@ static void calc_bounding_box_simd4(int na, const float *x, nbnxn_bb_t *bb) int i; - bb_0_S = gmx_simd4_load_pr(x); + bb_0_S = gmx_simd4_load_bb_pr(x); bb_1_S = bb_0_S; for (i = 1; i < na; i++) { - x_S = gmx_simd4_load_pr(x+i*NNBSBB_C); + x_S = gmx_simd4_load_bb_pr(x+i*NNBSBB_C); bb_0_S = gmx_simd4_min_pr(bb_0_S, x_S); bb_1_S = gmx_simd4_max_pr(bb_1_S, x_S); } @@ -925,10 +925,10 @@ static void combine_bounding_box_pairs(nbnxn_grid_t *grid, const nbnxn_bb_t *bb) #ifdef NBNXN_SEARCH_BB_SIMD4 gmx_simd4_pr min_S, max_S; - min_S = gmx_simd4_min_pr(gmx_simd4_load_pr(&bb[c2*2+0].lower[0]), - gmx_simd4_load_pr(&bb[c2*2+1].lower[0])); - max_S = gmx_simd4_max_pr(gmx_simd4_load_pr(&bb[c2*2+0].upper[0]), - gmx_simd4_load_pr(&bb[c2*2+1].upper[0])); + min_S = gmx_simd4_min_pr(gmx_simd4_load_bb_pr(&bb[c2*2+0].lower[0]), + gmx_simd4_load_bb_pr(&bb[c2*2+1].lower[0])); + max_S = gmx_simd4_max_pr(gmx_simd4_load_bb_pr(&bb[c2*2+0].upper[0]), + gmx_simd4_load_bb_pr(&bb[c2*2+1].upper[0])); gmx_simd4_store_pr(&grid->bbj[c2].lower[0], min_S); gmx_simd4_store_pr(&grid->bbj[c2].upper[0], max_S); #else @@ -2077,10 +2077,10 @@ static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci, gmx_simd4_pr dm_S; gmx_simd4_pr dm0_S; - bb_i_S0 = gmx_simd4_load_pr(&bb_i_ci[si].lower[0]); - bb_i_S1 = 
gmx_simd4_load_pr(&bb_i_ci[si].upper[0]); - bb_j_S0 = gmx_simd4_load_pr(&bb_j_all[csj].lower[0]); - bb_j_S1 = gmx_simd4_load_pr(&bb_j_all[csj].upper[0]); + bb_i_S0 = gmx_simd4_load_bb_pr(&bb_i_ci[si].lower[0]); + bb_i_S1 = gmx_simd4_load_bb_pr(&bb_i_ci[si].upper[0]); + bb_j_S0 = gmx_simd4_load_bb_pr(&bb_j_all[csj].lower[0]); + bb_j_S1 = gmx_simd4_load_bb_pr(&bb_j_all[csj].upper[0]); dl_S = gmx_simd4_sub_pr(bb_i_S0, bb_j_S1); dh_S = gmx_simd4_sub_pr(bb_j_S0, bb_i_S1); @@ -2107,12 +2107,12 @@ static float subc_bb_dist2_simd4(int si, const nbnxn_bb_t *bb_i_ci, \ shi = si*NNBSBB_D*DIM; \ \ - xi_l = gmx_simd4_load_pr(bb_i+shi+0*STRIDE_PBB); \ - yi_l = gmx_simd4_load_pr(bb_i+shi+1*STRIDE_PBB); \ - zi_l = gmx_simd4_load_pr(bb_i+shi+2*STRIDE_PBB); \ - xi_h = gmx_simd4_load_pr(bb_i+shi+3*STRIDE_PBB); \ - yi_h = gmx_simd4_load_pr(bb_i+shi+4*STRIDE_PBB); \ - zi_h = gmx_simd4_load_pr(bb_i+shi+5*STRIDE_PBB); \ + xi_l = gmx_simd4_load_bb_pr(bb_i+shi+0*STRIDE_PBB); \ + yi_l = gmx_simd4_load_bb_pr(bb_i+shi+1*STRIDE_PBB); \ + zi_l = gmx_simd4_load_bb_pr(bb_i+shi+2*STRIDE_PBB); \ + xi_h = gmx_simd4_load_bb_pr(bb_i+shi+3*STRIDE_PBB); \ + yi_h = gmx_simd4_load_bb_pr(bb_i+shi+4*STRIDE_PBB); \ + zi_h = gmx_simd4_load_bb_pr(bb_i+shi+5*STRIDE_PBB); \ \ dx_0 = gmx_simd4_sub_pr(xi_l, xj_h); \ dy_0 = gmx_simd4_sub_pr(yi_l, yj_h); \ @@ -2237,12 +2237,12 @@ static gmx_bool subc_in_range_simd4(int na_c, rc2_S = gmx_simd4_set1_pr(rl2); dim_stride = NBNXN_GPU_CLUSTER_SIZE/STRIDE_PBB*DIM; - ix_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+0)*STRIDE_PBB); - iy_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+1)*STRIDE_PBB); - iz_S0 = gmx_simd4_load_pr(x_i+(si*dim_stride+2)*STRIDE_PBB); - ix_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+3)*STRIDE_PBB); - iy_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+4)*STRIDE_PBB); - iz_S1 = gmx_simd4_load_pr(x_i+(si*dim_stride+5)*STRIDE_PBB); + ix_S0 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+0)*STRIDE_PBB); + iy_S0 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+1)*STRIDE_PBB); + iz_S0 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+2)*STRIDE_PBB); + ix_S1 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+3)*STRIDE_PBB); + iy_S1 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+4)*STRIDE_PBB); + iz_S1 = gmx_simd4_load_bb_pr(x_i+(si*dim_stride+5)*STRIDE_PBB); /* We loop from the outer to the inner particles to maximize * the chance that we find a pair in range quickly and return. 
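The gmx_simd4_load_bb_pr loads above feed the clamped bounding-box distance used for pair-list pruning; in plain C the quantity subc_bb_dist2_simd4 computes is (sketch, illustration only):

    #include <stdio.h>

    typedef struct {
        float lower[3];
        float upper[3];
    } bb_t;

    static float bb_dist2(const bb_t *bi, const bb_t *bj)
    {
        float d2 = 0;
        int   d;

        for (d = 0; d < 3; d++)
        {
            float dl  = bi->lower[d] - bj->upper[d];
            float dh  = bj->lower[d] - bi->upper[d];
            float dm  = dl > dh ? dl : dh;
            float dm0 = dm > 0 ? dm : 0;   /* overlapping boxes contribute zero */

            d2 += dm0*dm0;
        }
        return d2;
    }

    int main(void)
    {
        bb_t a = { { 0, 0, 0 }, { 1, 1, 1 } };
        bb_t b = { { 3, 0, 0 }, { 4, 1, 1 } };

        printf("d2 = %g\n", bb_dist2(&a, &b));   /* gap of 2 along x gives 4 */
        return 0;
    }

Overlapping boxes give a distance of zero, so the subsequent cut-off comparison never rejects a cluster pair that could contain atoms within range.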
@@ -3190,6 +3190,12 @@ static void set_ci_top_excls(const nbnxn_search_t nbs, inner_e = ge - (se << na_cj_2log); nbl->cj[found].excl &= ~(1U<<((inner_i<cj[found].interaction_mask_indices[inner_i] &= ~(1U << inner_e); +#endif } } } diff --git a/src/mdlib/nbnxn_search.h b/src/mdlib/nbnxn_search.h index 48cdafd277..1898257b55 100644 --- a/src/mdlib/nbnxn_search.h +++ b/src/mdlib/nbnxn_search.h @@ -45,7 +45,6 @@ extern "C" { #endif - /* Returns the j-cluster size for kernel of type nb_kernel_type */ int nbnxn_kernel_to_cj_size(int nb_kernel_type); diff --git a/src/mdlib/nbnxn_search_simd_4xn.h b/src/mdlib/nbnxn_search_simd_4xn.h index 9d5c638c43..4fd4129fc0 100644 --- a/src/mdlib/nbnxn_search_simd_4xn.h +++ b/src/mdlib/nbnxn_search_simd_4xn.h @@ -266,6 +266,12 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, /* Store cj and the interaction mask */ nbl->cj[nbl->ncj].cj = CI_TO_CJ_SIMD_4XN(gridj->cell0) + cj; nbl->cj[nbl->ncj].excl = get_imask_simd_4xn(remove_sub_diag, ci, cj); +#ifdef GMX_CPU_ACCELERATION_IBM_QPX + nbl->cj[nbl->ncj].interaction_mask_indices[0] = (nbl->cj[nbl->ncj].excl & 0x000F) >> (0 * 4); + nbl->cj[nbl->ncj].interaction_mask_indices[1] = (nbl->cj[nbl->ncj].excl & 0x00F0) >> (1 * 4); + nbl->cj[nbl->ncj].interaction_mask_indices[2] = (nbl->cj[nbl->ncj].excl & 0x0F00) >> (2 * 4); + nbl->cj[nbl->ncj].interaction_mask_indices[3] = (nbl->cj[nbl->ncj].excl & 0xF000) >> (3 * 4); +#endif nbl->ncj++; } /* Increase the closing index in i super-cell list */ @@ -274,4 +280,3 @@ make_cluster_list_simd_4xn(const nbnxn_grid_t *gridj, } #undef STRIDE_S - -- 2.22.0
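For reference, the 4-bit interaction_mask_indices stored above are simply the per-i-row nibbles of the pair-list exclusion word, and each one indexes the 16-entry simd_interaction_array built in nbnxn_atomdata_init_simple_exclusion_masks. A scalar sketch of the lookup (illustration only; a SIMD width of 4 is assumed):

    #include <stdio.h>

    #define SIMD_WIDTH 4

    int main(void)
    {
        float    table[16][SIMD_WIDTH];
        unsigned excl = 0xFFF7U;   /* example pair-list word: pair (i=0, j=3) excluded */
        int      k, i, j;

        /* Same construction as in nbnxn_atomdata_init_simple_exclusion_masks:
         * entry k holds a positive value in lane j when bit j of k is set. */
        for (k = 0; k < 16; k++)
        {
            for (j = 0; j < SIMD_WIDTH; j++)
            {
                table[k][j] = (k & (1 << j)) ? 1.0f : -1.0f;
            }
        }
        for (i = 0; i < 4; i++)
        {
            int index = (excl >> (i*4)) & 0xF;   /* = interaction_mask_indices[i] */

            for (j = 0; j < SIMD_WIDTH; j++)
            {
                printf("%d", table[index][j] > 0 ? 1 : 0);   /* 1 = pair interacts */
            }
            printf("\n");
        }
        return 0;
    }

The sign of each loaded element then acts as the per-lane interaction mask in the QPX kernels, replacing the bit-wise filter test used on x86.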